In [1]:
import pandas as pd
from sklearn.datasets import load_breast_cancer


In [2]:
# Pull scikit-learn's bundled breast cancer dataset and separate it into a
# column-labelled feature table (X) and the class-label vector (y).
cancer = load_breast_cancer()
y = cancer.target
X = pd.DataFrame(cancer.data, columns=cancer.feature_names)

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [4]:
# Hold out 20% of the rows for testing; the fixed random_state makes the
# partition identical on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [5]:
# Standardize the features. The scaler learns its statistics from the training
# split only and then applies the same transformation to both splits, so no
# information from the test rows leaks into the scaling.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [6]:
# Shared cut-off for the score-based selectors in this notebook: it is compared
# against mutual-information scores, absolute correlations, and feature variances.
threshold: float = 0.2

In [7]:
# 1. Information Gain
# Estimate the mutual information between each standardized feature and the
# class label, then keep the features whose estimate exceeds `threshold`.
from sklearn.feature_selection import mutual_info_classif

# The estimator adds a small amount of random noise to continuous features, so
# without a fixed random_state the scores (and any feature sitting close to the
# cut-off) can change from one run to the next.
information_gain = mutual_info_classif(X_train_scaled, y_train, random_state=42)
selected_features_info_gain = X_train.columns[information_gain > threshold]
print("Selected Features (Information Gain):", selected_features_info_gain)

Selected Features (Information Gain): Index(['mean radius', 'mean perimeter', 'mean area', 'mean compactness',
       'mean concavity', 'mean concave points', 'radius error',
       'perimeter error', 'area error', 'worst radius', 'worst perimeter',
       'worst area', 'worst compactness', 'worst concavity',
       'worst concave points'],
      dtype='object')


In [8]:
# 2. Chi-square Test
# Score every feature against the class label with the chi-square statistic
# and keep the features whose p-value falls below the significance level.
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

# chi2 only accepts non-negative inputs, so it is fitted on the raw features
# rather than on the standardized ones (which contain negative values).
chi2_selector = SelectKBest(chi2, k='all')  # k='all' -> compute a score for every feature
chi2_selector.fit(X_train, y_train)

# With k='all', get_support() is True for every column and therefore selects
# nothing; filter on the fitted p-values instead.
chi2_significance_level = 0.05
selected_features_chi2 = X_train.columns[chi2_selector.pvalues_ < chi2_significance_level]
print("Selected Features (Chi-square Test):", selected_features_chi2)


In [9]:
# 3. Fisher’s Score
# Score every standardized feature with the ANOVA F-test (f_classif) and keep
# the features whose p-value falls below the significance level.
from sklearn.feature_selection import SelectKBest, f_classif

fisher_selector = SelectKBest(score_func=f_classif, k='all')  # k='all' -> score every feature
fisher_selector.fit(X_train_scaled, y_train)

# With k='all', get_support() is True for every column and therefore selects
# nothing; filter on the fitted p-values instead.
fisher_significance_level = 0.05
selected_features_fisher = X_train.columns[fisher_selector.pvalues_ < fisher_significance_level]
print("Selected Features (Fisher’s Score):", selected_features_fisher)


In [10]:
# 4. Correlation Coefficient
# Keep the features whose absolute Pearson correlation with the class label
# exceeds `threshold`.
X_train_df = pd.DataFrame(X_train_scaled, columns=X_train.columns)

# Feature-to-feature correlations. The target is not a column of this matrix,
# so it cannot be looked up here (doing so is what raised the KeyError).
correlation_matrix = X_train_df.corr()

# y_train is a NumPy array; give it the same row index as X_train_df so that
# corrwith pairs each label with the matching feature row.
target = pd.Series(y_train, index=X_train_df.index, name="target")
correlation_with_target = X_train_df.corrwith(target).abs()

selected_features_corr = X_train_df.columns[correlation_with_target > threshold]
print("Selected Features (Correlation Coefficient):", selected_features_corr)

In [11]:
# 5. Variance Threshold
# Drop the features whose training-set variance does not exceed `threshold`.
from sklearn.feature_selection import VarianceThreshold

variance_selector = VarianceThreshold(threshold=threshold)

# Fit on the unscaled training features: StandardScaler gives every column unit
# variance, so on X_train_scaled no feature could ever fall below the threshold
# and the selector would keep everything.
# NOTE: raw variances depend on each feature's units, so this cut-off is
# scale-dependent; revisit the threshold if the features are rescaled.
variance_selector.fit(X_train)
selected_features_variance = X_train.columns[variance_selector.get_support()]
print("Selected Features (Variance Threshold):", selected_features_variance)


In [12]:
# 6. LASSO Regularization (L1)
# Fit an L1-penalized linear model on the standardized features and keep the
# features whose coefficient was not driven to exactly zero.
from sklearn.linear_model import Lasso

lasso_selector = Lasso(alpha=0.01)
lasso_selector.fit(X_train_scaled, y_train)

lasso_nonzero_mask = lasso_selector.coef_ != 0
selected_features_lasso = X_train.columns[lasso_nonzero_mask]
print("Selected Features (LASSO Regularization):", selected_features_lasso)

Selected Features (LASSO Regularization): Index(['mean texture', 'mean concave points', 'mean fractal dimension',
       'radius error', 'smoothness error', 'compactness error',
       'concavity error', 'worst radius', 'worst texture', 'worst smoothness',
       'worst concavity', 'worst concave points', 'worst symmetry'],
      dtype='object')
