In [5]:
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.feature_selection import SelectKBest, f_classif, f_regression
from sklearn.preprocessing import PolynomialFeatures

In [6]:
# Pull the breast-cancer dataset through scikit-learn's loader.
# The target vector is kept as returned by the loader; the feature matrix is
# wrapped in a DataFrame so each column is labelled with its feature name.
data = load_breast_cancer()
y = data.target
df = pd.DataFrame(data=data.data, columns=data.feature_names)
X = df

In [8]:
# Feature selection using correlation
# corrwith() pairs rows by index label, so the target Series is built on X's
# own index. A bare pd.Series(y) always gets a fresh 0..n-1 index, which only
# lines up while X still has the default index; with any other index the
# correlations would come out NaN or mispaired without raising an error.
correlation = (
    X.corrwith(pd.Series(y, index=X.index))
    .abs()
    .sort_values(ascending=False)
)
# Keep the columns whose absolute correlation with the target exceeds 0.5.
selected_features = correlation[correlation > 0.5].index.tolist()

In [9]:
# Expand the correlation-selected columns into their degree-2 polynomial terms
# (the original columns, their squares and pairwise products); the constant
# bias column is left out.
poly = PolynomialFeatures(include_bias=False, degree=2)
poly.fit(X[selected_features])
X_poly = poly.transform(X[selected_features])

In [10]:
# Select top K features
# y is the class label taken from data.target, so columns are scored with the
# classification ANOVA F-test (f_classif) rather than f_regression, which is
# meant for continuous targets.
# k is capped at the number of available columns so that a smaller X_poly
# (e.g. after a stricter correlation threshold) never requests more features
# than exist.
selector = SelectKBest(score_func=f_classif, k=min(10, X_poly.shape[1]))
X_selected = selector.fit_transform(X_poly, y)

In [12]:
# Translate the selector's choice back into readable polynomial term names:
# ask the selector for the integer positions of the kept columns and use them
# to index the names PolynomialFeatures generated for selected_features.
selected_feature_names = poly.get_feature_names_out(selected_features)[
    selector.get_support(indices=True)
]
print("Selected features:", selected_feature_names.tolist())
print("Shape of selected features:", X_selected.shape)

Selected features: ['worst concave points', 'worst perimeter', 'mean concave points', 'worst radius', 'worst concave points^2', 'worst concave points worst perimeter', 'worst concave points worst radius', 'worst concave points mean perimeter', 'worst concave points mean radius', 'worst radius worst concavity']
Shape of selected features: (569, 10)
