# Feature Selection


In [None]:
import os
from functools import partial

import matplotlib.pyplot as plt
import pandas as pd
import ta
from sklearn.feature_selection import (
    SelectKBest,
    VarianceThreshold,
    chi2,
    mutual_info_regression,
)


# Move the working directory one level up so the relative "data/..." paths
# used below resolve. NOTE: this is relative, so re-running the cell climbs
# one more directory each time.
os.chdir(os.pardir)

In [None]:
# Load the bronze-layer BTC dataset and preview the first rows.
bronze_path = 'data/bronze/crypto_stock_bronze_btc.parquet'
df = pd.read_parquet(bronze_path)
df.head()

In [None]:
# Compute the full set of `ta` technical indicators from the OHLCV columns.
# `close` must point at the closing-price column ("last_close", the column
# used as the prediction target further down), not at the high ("last_max");
# otherwise every close-based indicator is computed from the wrong series.
df_features_indicators = ta.add_all_ta_features(
    df,
    open="last_open",
    high="last_max",
    low="last_min",
    close="last_close",
    volume="volume",
)
df_features_indicators

In [None]:
# Bar chart of the percentage of missing values, restricted to the columns
# that have at least one NaN.
plt.figure(figsize=(15, 6))
missing_counts = df_features_indicators.isna().sum()
columns_with_missing = missing_counts[missing_counts > 0]
missing_percentage = (columns_with_missing / df_features_indicators.shape[0]) * 100
missing_percentage.plot(kind='bar', title='Percentage of missing values per feature')
plt.grid(axis='y');


In [None]:
# Keep every column except the two PSAR up/down indicators (presumably
# excluded because they are mostly missing -- confirm against the
# missing-value plot). The original column order is preserved: building the
# list from a set difference yields a hash-dependent order, which makes any
# positional feature index computed later differ between kernel sessions.
excluded_columns = {"trend_psar_up", "trend_psar_down"}
select_columns = [
    column
    for column in df_features_indicators.columns
    if column not in excluded_columns
]

In [None]:
# Restrict to the selected columns and discard every row that still has a
# missing value; the copy detaches the result from the source frame.
df_features_indicators_not_null = (
    df_features_indicators.loc[:, select_columns]
    .dropna()
    .copy()
)
df_features_indicators_not_null

In [None]:
# Line plot of last_close indexed by date.
close_by_date = df_features_indicators_not_null.set_index("date")["last_close"]
close_by_date.plot(figsize=(15, 6))

In [None]:
# Promote "date" to the index (removing it from the columns) so it is not
# treated as a feature below.
df_features_indicators_not_null = df_features_indicators_not_null.set_index(
    "date", drop=True
)

In [None]:
# Split into the target (last_close) and the feature matrix (everything else).
target_column = "last_close"
y = df_features_indicators_not_null[target_column]
X = df_features_indicators_not_null.drop(columns=[target_column])

In [None]:
# Summary statistics of the target, shown as a one-column table.
y.to_frame().describe()

In [None]:
# Variance filter: fit VarianceThreshold on the features and report how many
# columns have a variance above the cutoff.
threshold = 0.1
vt = VarianceThreshold(threshold=threshold)
X_vt = vt.fit_transform(X)

n_original, n_kept = X.shape[1], X_vt.shape[1]
print('# vars original: {0}, # vars com threshold: {1}'.format(n_original, n_kept))

# Per-feature variances on a log scale, with the cutoff as a dashed line.
variances = vt.variances_
n_variances = len(variances)
fig, ax = plt.subplots()
ax.bar(range(n_variances), variances)
ax.set_yscale('log')
ax.plot([0, n_variances], [threshold, threshold], "k--")

print(variances)

In [None]:
# Rank the features by mutual information with the target and keep the
# top 10. mutual_info_regression is randomized, so its seed is pinned here
# to make the selection reproducible from run to run.
skb_mi = SelectKBest(
    score_func=partial(mutual_info_regression, random_state=0),
    k=10,
)

# Fit the selector; the result holds only the selected columns.
X_selected = skb_mi.fit_transform(X, y)

# Positional indices (into X's columns) of the selected features.
selected_indices = skb_mi.get_support(indices=True)

print("Indices of selected features:", selected_indices)
# Names are easier to interpret than positions, which depend on column order.
print("Names of selected features:", list(X.columns[selected_indices]))

In [None]:
# Narrow X down to the columns picked by the selector.
selected_columns = X.columns[selected_indices]
X = X.loc[:, selected_columns].copy()

In [None]:
# Pairwise correlations between the selected features and the target.
features_with_target = pd.concat([X, y], axis="columns")
features_with_target.corr()

In [None]:
# Heatmap of the correlation matrix of the selected features plus the target.
# Drawn with matplotlib because `sns` (seaborn) is never imported in this
# notebook, so the previous sns.heatmap call raised NameError.
corr = pd.concat([X, y], axis=1).corr()
heat_fig, heat_ax = plt.subplots(figsize=(10, 8))
# Correlations lie in [-1, 1]; fix the colour range so 0 maps to the midpoint.
heat_image = heat_ax.imshow(corr.to_numpy(), cmap="coolwarm", vmin=-1, vmax=1)
heat_fig.colorbar(heat_image, ax=heat_ax)
heat_ax.set_xticks(range(len(corr.columns)))
heat_ax.set_xticklabels(corr.columns, rotation=90)
heat_ax.set_yticks(range(len(corr.index)))
heat_ax.set_yticklabels(corr.index);