## pacotes e configurações

In [19]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np

## carregamento de dados

In [2]:
# Location of the feature-selection CSV under /content/drive/MyDrive.
# NOTE(review): presumably requires Google Drive to be mounted at /content/drive
# before this notebook runs — no mount step is visible here; confirm.
local = (
    '/content/drive/MyDrive/Cursos/2023/'
    'Modelagem e Preparação de Dados para Aprendizado de Máquina/'
    '03 - Práticas/Aula 5/feat_select_final.csv'
)

In [6]:
# Read the CSV at `local` into a DataFrame and display its (rows, columns) shape.
df = pd.read_csv(local)
df.shape

In [7]:
# Show 3 randomly drawn rows; no seed is passed, so the rows differ between runs.
df.sample(3)

In [8]:
# Print the full per-column summary (verbose=True lists every column instead of a shortened overview).
df.info(verbose=True)

In [10]:
# Feature matrix: every column except 'id' and the response 'target'.
X = df.drop(columns=['id', 'target'])
# Response as a 1-D Series of shape (n_samples,). Selecting it with
# df[['target']] yields an (n_samples, 1) frame, which scikit-learn has to
# flatten and reports with a DataConversionWarning.
y = df['target']
X.shape, y.shape

In [16]:
# Restrict the feature matrix to its first 50 columns (positional slice).
X = X.iloc[:, :50]
X.shape

## EDA

In [12]:
from ydata_profiling import ProfileReport

In [13]:
# Build the profiling report from the full DataFrame `df` (not the reduced X)
# and render it as the cell output.
profile = ProfileReport(df, title="Profiling Report")
profile

## correlação

In [21]:
# Spearman correlation between the features (the response variable is not in X),
# rounded to two decimals and then taken in absolute value so that only the
# strength of each relationship is kept.
corr = X.corr(method='spearman').round(2).abs()
corr.head()

In [22]:
# Heat map of the absolute correlations between the features.
# The plotted values lie in [0, 1], so a sequential colour map anchored at 0 and 1
# is used; a diverging map centred on 0 would leave half of its range unused and
# suggest a sign that is no longer present in the data.
corr_abs = corr.abs()  # no-op when corr is already absolute; keeps this cell self-contained
fig, ax = plt.subplots(figsize=(20, 20))
sns.heatmap(corr_abs,
            # hide the upper triangle and the diagonal (the matrix is symmetric)
            mask=np.triu(np.ones_like(corr_abs, dtype=bool)),
            cmap='YlGnBu', vmin=0, vmax=1, cbar_kws={"shrink": .75},
            annot=True, annot_kws={"fontsize": 8}, fmt='.2f',
            square=True, linewidths=.5, ax=ax)
ax.set_title('Correlação de Spearman (valor absoluto) entre as variáveis');

## variância

In [30]:
from sklearn.feature_selection import VarianceThreshold

In [37]:
# Keep only the features whose variance is above 0.01.
selector = VarianceThreshold(threshold=0.01)
# Hold on to the reduced matrix and report its shape (rows, surviving features)
# instead of echoing the raw array and throwing the result away.
X_var = selector.fit_transform(X)
X_var.shape

In [38]:
# Names of the features the fitted selector keeps.
selector.get_feature_names_out()

## anova

In [39]:
from sklearn.feature_selection import f_classif

In [40]:
# ANOVA F-test of each feature in X against the target y.
# f_classif returns a pair of arrays, (F-statistics, p-values), with one entry per column of X.
anova = f_classif(X, y)
anova

In [41]:
# Collect the ANOVA result in a new DataFrame, one row per feature.
# anova[1] is the second array returned by f_classif (the p-values), so in this
# table a *smaller* 'importancia' means stronger evidence against the null hypothesis.
imp = pd.DataFrame({'variavel': X.columns, 'importancia': anova[1]})
imp

In [43]:
# First 20 rows after sorting by 'importancia' in ascending order (smallest values first).
imp.sort_values(by='importancia').head(20)

## importância intrínseca (RF)

In [44]:
from sklearn.ensemble import RandomForestClassifier

In [45]:
# Random forest used for its built-in (impurity-based) feature importances.
# random_state pins the bootstrap samples and feature sub-sampling so the
# importances are the same on every run; n_jobs=-1 uses all available cores.
rfc = RandomForestClassifier(n_jobs=-1, random_state=0)
# np.ravel flattens the target to shape (n_samples,), whether y is a Series or a
# single-column DataFrame, which avoids scikit-learn's column-vector warning.
rfc.fit(X, np.ravel(y))

In [46]:
# Importance scores of the fitted forest, one value per column of X.
rfc.feature_importances_

In [47]:
# Pair every feature name with the importance assigned by the fitted forest.
imp = pd.DataFrame({'variavel': X.columns, 'importancia': rfc.feature_importances_})
imp

In [48]:
# First 20 rows after sorting by 'importancia' in descending order (largest values first).
imp.sort_values(by='importancia', ascending=False).head(20)

## permutation importance

In [56]:
from sklearn.inspection import permutation_importance

In [59]:
# Permutation importance of the fitted forest: each feature is shuffled
# n_repeats times with a fixed seed.
# NOTE(review): this is evaluated on the same X, y the forest was fitted on, so
# the scores describe the training fit — consider a held-out split.
r = permutation_importance(
    estimator=rfc,
    X=X,
    y=y,
    n_repeats=3,
    random_state=0,
)

In [61]:
# Walk the features from highest to lowest mean permutation importance and print
# only those whose mean stays positive after subtracting two standard deviations.
order = r.importances_mean.argsort()[::-1]
for idx in order:
    mean_imp = r.importances_mean[idx]
    std_imp = r.importances_std[idx]
    if mean_imp - 2 * std_imp > 0:
        print(f"{X.columns[idx]:<8}"
              f"{mean_imp:.5f}"
              f" +/- {std_imp:.5f}")

## mutual information

In [49]:
from sklearn.feature_selection import mutual_info_classif

In [50]:
# Mutual information between each feature and the target; the seed is fixed so
# the estimate is repeatable.
mif = mutual_info_classif(
    X,
    y,
    random_state=14,
)

In [54]:
# Wrap the scores in a Series indexed by feature name
importance_mif = pd.Series(data=mif, index=X.columns)

# Bar chart with the score of every feature
importance_mif.plot(kind='bar', figsize=(10, 4));

In [55]:
# First 20 entries after sorting the scores in descending order.
importance_mif.sort_values(ascending=False).head(20)