## **Seleção de features**

- Seleção das features mais relevantes para a análise exploratória de dados (EDA)

In [None]:
import pandas as pd

# Load the full dataset and preview its first five rows.
df = pd.read_csv(filepath_or_buffer="/content/df_final_full_v3.csv")
df.head(n=5)

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,INST_DAYS_INSTALMENT_MAX,INST_DAYS_ENTRY_PAYMENT_MEAN,INST_DAYS_ENTRY_PAYMENT_MIN,INST_DAYS_ENTRY_PAYMENT_MAX,INST_AMT_INSTALMENT_MEAN,INST_AMT_INSTALMENT_MAX,INST_AMT_PAYMENT_MEAN,INST_AMT_PAYMENT_MAX,INST_NUM_INSTALMENT_NUMBER_MAX,INST_NUM_INSTALMENT_NUMBER_COUNT
0,100002.0,1.0,Cash loans,M,N,Y,0.0,202500.0,406597.5,24700.5,...,-25.0,-315.421053,-587.0,-49.0,11559.247105,53093.745,11559.247105,53093.745,19.0,19.0
1,100003.0,0.0,Cash loans,F,N,N,0.0,270000.0,1293502.5,35698.5,...,-536.0,-1385.32,-2324.0,-544.0,64754.586,560835.36,64754.586,560835.36,12.0,25.0
2,100004.0,0.0,Revolving loans,M,Y,Y,0.0,67500.0,135000.0,6750.0,...,-724.0,-761.666667,-795.0,-727.0,7096.155,10573.965,7096.155,10573.965,3.0,3.0
3,100006.0,0.0,Cash loans,F,N,Y,0.0,135000.0,312682.5,29686.5,...,-11.0,-271.625,-575.0,-12.0,62947.088438,691786.89,62947.088438,691786.89,10.0,16.0
4,100007.0,0.0,Cash loans,M,N,Y,0.0,121500.0,513000.0,21865.5,...,-14.0,-1032.242424,-2318.0,-14.0,12666.444545,22678.785,12214.060227,22678.785,17.0,66.0


In [None]:
# Names of the target column and of the row-identifier column
TARGET, ID_COL = "TARGET", "SK_ID_CURR"

# Target vector (y) and feature matrix (X); the identifier is not a feature,
# so it is dropped together with the target.
y = df[TARGET]
X = df.drop(columns=[TARGET, ID_COL])

print("Features iniciais:", X.shape[1])


Features iniciais: 205


In [None]:
# List the categorical columns (those stored with the object dtype)
X.select_dtypes(include=["object"]).columns

Index(['NAME_CONTRACT_TYPE', 'CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY',
       'NAME_TYPE_SUITE', 'NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE',
       'NAME_FAMILY_STATUS', 'NAME_HOUSING_TYPE', 'OCCUPATION_TYPE',
       'WEEKDAY_APPR_PROCESS_START', 'ORGANIZATION_TYPE', 'FONDKAPREMONT_MODE',
       'HOUSETYPE_MODE', 'WALLSMATERIAL_MODE', 'EMERGENCYSTATE_MODE'],
      dtype='object')

In [None]:
# Keep only the non-object (numeric) features
X_num = X.select_dtypes(exclude=["object"])

In [None]:
from sklearn.feature_selection import VarianceThreshold

# Fit a low-variance filter on the numeric features; with threshold=0.0 only
# constant features are rejected.
vt = VarianceThreshold(threshold=0.0).fit(X_num)

# get_support() flags the columns that are kept, so its negation gives the
# zero-variance columns.
low_variance_cols = X_num.columns[~vt.get_support()]

# Number of features that can be removed
len(low_variance_cols)


4

In [None]:
# Remove the zero-variance features from X.
# errors="ignore" keeps this cell re-runnable: X is rebound to the result, so
# on a second run the columns are already gone and a strict drop would raise
# KeyError.
X = X.drop(columns=low_variance_cols, errors="ignore")

print("Features após variância zero:", X.shape[1])

Features após variância zero: 201


In [None]:
# Fraction of missing values per feature
nan_ratio = X.isna().mean()

# Columns where more than 95% of the values are missing
high_nan_cols = nan_ratio.loc[nan_ratio.gt(0.95)].index

# Number of such columns
len(high_nan_cols)


0

In [None]:
# Names of the categorical (object dtype) columns still present in X
cat_cols = list(X.select_dtypes(include=["object"]).columns)

# Show how many there are, plus the first ten names as a sample
(len(cat_cols), cat_cols[:10])

(16,
 ['NAME_CONTRACT_TYPE',
  'CODE_GENDER',
  'FLAG_OWN_CAR',
  'FLAG_OWN_REALTY',
  'NAME_TYPE_SUITE',
  'NAME_INCOME_TYPE',
  'NAME_EDUCATION_TYPE',
  'NAME_FAMILY_STATUS',
  'NAME_HOUSING_TYPE',
  'OCCUPATION_TYPE'])

In [None]:
# Cast every categorical column to the pandas `category` dtype in one call.
# NOTE(review): presumably this is what lets LightGBM treat them as
# categorical features without manual encoding - confirm against its docs.
X = X.astype({col: "category" for col in cat_cols})


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation. Stratifying on y keeps the target
# class proportions the same in both partitions; the seed makes the split
# reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

print(X_train.shape, X_val.shape)

(246008, 201) (61503, 201)


In [None]:
import lightgbm as lgb

# Hyper-parameters of the LightGBM classifier used to rank the features
lgbm_params = dict(
    n_estimators=300,
    learning_rate=0.05,
    random_state=42,
    n_jobs=-1,
)
model = lgb.LGBMClassifier(**lgbm_params)

# Train on the training partition, passing the validation partition as the
# evaluation set with AUC as the metric.
model.fit(X_train, y_train, eval_set=[(X_val, y_val)], eval_metric="auc")

[LightGBM] [Info] Number of positive: 19860, number of negative: 226148
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.812325 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 28584
[LightGBM] [Info] Number of data points in the train set: 246008, number of used features: 197
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.080729 -> initscore=-2.432482
[LightGBM] [Info] Start training from score -2.432482


In [None]:
import pandas as pd

# Pair every training column with the importance the fitted model assigned
# to it, most important first.
importance = pd.DataFrame(
    dict(feature=X_train.columns, importance=model.feature_importances_)
).sort_values("importance", ascending=False)

importance.head(20)

Unnamed: 0,feature,importance
38,ORGANIZATION_TYPE,951
39,EXT_SOURCE_1,325
40,EXT_SOURCE_2,286
41,EXT_SOURCE_3,285
15,DAYS_BIRTH,211
7,AMT_ANNUITY,203
6,AMT_CREDIT,200
197,INST_AMT_PAYMENT_MEAN,182
157,BUREAU_DAYS_CREDIT_MAX,168
16,DAYS_EMPLOYED,164


In [None]:
# Number of features whose importance is exactly zero
importance["importance"].eq(0).sum()

np.int64(25)

In [None]:
# Number of features with strictly positive importance
len(importance.loc[importance["importance"].gt(0)])

176

In [None]:
# Add two derived columns:
#   importance_norm - each feature's share of the total importance
#   importance_cum  - running sum of those shares in the current row order
importance = importance.assign(
    importance_norm=lambda d: d["importance"] / d["importance"].sum(),
    importance_cum=lambda d: d["importance_norm"].cumsum(),
)

importance.head(20)

Unnamed: 0,feature,importance,importance_norm,importance_cum
38,ORGANIZATION_TYPE,951,0.105667,0.105667
39,EXT_SOURCE_1,325,0.036111,0.141778
40,EXT_SOURCE_2,286,0.031778,0.173556
41,EXT_SOURCE_3,285,0.031667,0.205222
15,DAYS_BIRTH,211,0.023444,0.228667
7,AMT_ANNUITY,203,0.022556,0.251222
6,AMT_CREDIT,200,0.022222,0.273444
197,INST_AMT_PAYMENT_MEAN,182,0.020222,0.293667
157,BUREAU_DAYS_CREDIT_MAX,168,0.018667,0.312333
16,DAYS_EMPLOYED,164,0.018222,0.330556


In [None]:
# Keep the features whose cumulative normalised importance is at most 80% of
# the total. `importance` is sorted from most to least important, so these are
# the leading rows of the ranking.
# The result must be bound to `selected_features`: that is the name this cell
# and the following cells read (the previous `elected_features` typo raised
# NameError on a fresh kernel).
selected_features = importance.loc[
    importance["importance_cum"] <= 0.80, "feature"
].tolist()

len(selected_features)


64

In [None]:
# Final set of columns to keep: the selected features followed by the target
final_features = [*selected_features, "TARGET"]

In [None]:
# Reduced DataFrame holding only the selected features and the target
df_reduced = df.loc[:, final_features]

In [None]:
# Show the shape of the reduced dataset. It is printed explicitly because a
# notebook cell only auto-displays its last expression, so a bare
# `df_reduced.shape` in the middle of the cell shows nothing.
print(df_reduced.shape)

# Check the class balance of the target variable
df_reduced["TARGET"].value_counts()

Unnamed: 0_level_0,count
TARGET,Unnamed: 1_level_1
0.0,282686
1.0,24825


In [None]:
# Persist the list of selected feature names, one per row, without the index
pd.Series(selected_features).to_csv("selected_features_80.csv", index=False)

In [None]:
# Persist the reduced dataset (selected features + target), without the index.
# It is written to the same /content/ location the later cells read it back
# from, so the reload does not depend on the current working directory.
df_reduced.to_csv(
    "/content/df_final_selected_features.csv",
    index=False
)

In [None]:
# Read the reduced dataset back from disk and preview its first ten rows.
# NOTE(review): this rebinds `df`, which until here held the full dataset, so
# earlier cells that expect the full frame cannot be re-run after this one.
df = pd.read_csv(filepath_or_buffer="/content/df_final_selected_features.csv")
df.head(n=10)

Unnamed: 0,ORGANIZATION_TYPE,EXT_SOURCE_1,EXT_SOURCE_2,EXT_SOURCE_3,DAYS_BIRTH,AMT_ANNUITY,AMT_CREDIT,INST_AMT_PAYMENT_MEAN,BUREAU_DAYS_CREDIT_MAX,DAYS_EMPLOYED,...,PREV_AMT_APPLICATION_MEAN,BUREAU_DAYS_CREDIT_ENDDATE_MIN,PREV_DAYS_DECISION_MIN,POS_CNT_INSTALMENT_MEAN,POS_MONTHS_BALANCE_MIN,PREV_AMT_GOODS_PRICE_MEAN,POS_SK_DPD_MEAN,PREV_STATUS_APPROVED,BUREAU_DAYS_ENDDATE_FACT_MIN,TARGET
0,Business Entity Type 3,0.083037,0.262949,0.139376,-9461.0,24700.5,406597.5,11559.247105,-103.0,-637.0,...,179055.0,-1072.0,-606.0,24.0,-19.0,179055.0,0.0,1.0,-1185.0,1.0
1,School,0.311267,0.622246,0.535276,-16765.0,35698.5,1293502.5,64754.586,-606.0,-1188.0,...,435436.5,-2434.0,-2341.0,10.107143,-77.0,435436.5,0.0,3.0,-2131.0,0.0
2,Government,0.505998,0.555912,0.729567,-19046.0,6750.0,135000.0,7096.155,-408.0,-225.0,...,24282.0,-595.0,-815.0,3.75,-27.0,24282.0,0.0,1.0,-683.0,0.0
3,Business Entity Type 3,0.505998,0.650442,0.535276,-19005.0,29686.5,312682.5,62947.088438,-300.0,-3039.0,...,272203.26,-1253.0,-617.0,11.428571,-20.0,408304.89,0.0,5.0,-1241.0,0.0
4,Religion,0.505998,0.322738,0.535276,-19932.0,21865.5,513000.0,12214.060227,-1149.0,-3038.0,...,150530.25,-783.0,-2357.0,15.333333,-77.0,150530.25,0.0,6.0,-783.0,0.0
5,Other,0.505998,0.354225,0.621226,-16941.0,27517.5,490495.5,27360.502714,-78.0,-1588.0,...,155701.8,-853.0,-2536.0,11.518072,-84.0,194627.25,339.060241,4.0,-1028.0,0.0
6,Business Entity Type 3,0.774761,0.724,0.49206,-13778.0,41301.0,1560726.0,9568.531765,-239.0,-3130.0,...,76741.714286,-2152.0,-1562.0,7.875,-96.0,76741.714286,0.0,7.0,-2152.0,0.0
7,Other,0.505998,0.714279,0.540654,-18850.0,42075.0,1530000.0,27449.208,-1138.0,-449.0,...,247212.0,-928.0,-1070.0,10.0,-35.0,247212.0,0.0,1.0,-1138.0,0.0
8,XNA,0.587334,0.205747,0.751724,-20099.0,33826.5,1019610.0,11328.893654,-1309.0,365243.0,...,202732.875,-2173.0,-2508.0,19.186667,-83.0,270310.5,205.666667,3.0,-2197.0,0.0
9,Electricity,0.505998,0.746644,0.535276,-14469.0,20250.0,405000.0,10451.285625,-300.0,-2019.0,...,60930.0,-1253.0,-1673.0,19.021739,-55.0,81240.0,0.0,3.0,-1241.0,0.0


In [None]:
# Reload the reduced dataset from disk
df_reduced = pd.read_csv("/content/df_final_selected_features.csv")

# Every column except the target is a feature
features = df_reduced.columns[df_reduced.columns != "TARGET"].tolist()

# Show how many final features there are, and their names
(len(features), features)


(64,
 ['ORGANIZATION_TYPE',
  'EXT_SOURCE_1',
  'EXT_SOURCE_2',
  'EXT_SOURCE_3',
  'DAYS_BIRTH',
  'AMT_ANNUITY',
  'AMT_CREDIT',
  'INST_AMT_PAYMENT_MEAN',
  'BUREAU_DAYS_CREDIT_MAX',
  'DAYS_EMPLOYED',
  'DAYS_ID_PUBLISH',
  'BUREAU_DAYS_CREDIT_ENDDATE_MAX',
  'POS_CNT_INSTALMENT_FUTURE_MEAN',
  'AMT_GOODS_PRICE',
  'CREDIT_AMT_BALANCE_MEAN',
  'DAYS_REGISTRATION',
  'INST_NUM_INSTALMENT_NUMBER_MAX',
  'PREV_CNT_PAYMENT_MEAN',
  'INST_NUM_INSTALMENT_NUMBER_COUNT',
  'DAYS_LAST_PHONE_CHANGE',
  'INST_DAYS_INSTALMENT_MAX',
  'CREDIT_AMT_DRAWINGS_CURRENT_MEAN',
  'BUREAU_AMT_CREDIT_SUM_MEAN',
  'INST_DAYS_ENTRY_PAYMENT_MAX',
  'PREV_STATUS_REFUSED',
  'BUREAU_AMT_CREDIT_SUM_DEBT_MEAN',
  'OCCUPATION_TYPE',
  'BUREAU_AMT_CREDIT_SUM_DEBT_SUM',
  'BUREAU_AMT_CREDIT_SUM_MAX',
  'POS_SK_DPD_DEF_MEAN',
  'AMT_INCOME_TOTAL',
  'BUREAU_AMT_CREDIT_SUM_SUM',
  'PREV_AMT_GOODS_PRICE_MIN',
  'PREV_DAYS_DECISION_MAX',
  'INST_DAYS_INSTALMENT_MIN',
  'CODE_GENDER',
  'POS_MONTHS_BALANCE_MAX',
  'POS