Reference:
- https://machinelearningmastery.com/feature-selection-with-categorical-data/
- https://www.youtube.com/watch?v=xlHk4okO8Ls
- https://towardsdatascience.com/multi-class-metrics-made-simple-part-ii-the-f1-score-ebe8b2c2ca1

In [None]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder

from sklearn.feature_selection import SelectKBest, chi2
from sklearn.feature_selection import RFE, RFECV
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import ExtraTreesClassifier

from sklearn.linear_model import LogisticRegression # For categorical outputs only (classification)
from sklearn.metrics import classification_report, confusion_matrix

from plotly.offline import init_notebook_mode, iplot
import plotly.express as px
init_notebook_mode(connected=True)

# Data Loading

In [None]:
# Read the header-less CSV and attach explicit column names
column_names = [
    'age', 'menopause', 'tumor_size', 'inv_nodes', 'node_caps',
    'deg_malig', 'breast', 'breast_quad', 'irradiat', 'class',
]
df = pd.read_csv('resources/data/breast-cancer.csv', header=None, names=column_names)
df.shape

In [None]:
# Per-column flag: True when the column contains at least one missing value
df.isna().any()

In [None]:
# Discard every row that has a missing value, then show the remaining size
df = df.dropna()
df.shape

# Categorical Values Distribution

In [None]:
# Frequency of each distinct value in the 'age' column
df['age'].value_counts()

In [None]:
# Frequency of each distinct value in the 'menopause' column
df['menopause'].value_counts()

In [None]:
# Frequency of each distinct value in the 'tumor_size' column
df['tumor_size'].value_counts()

In [None]:
# Frequency of each distinct value in the 'inv_nodes' column
df['inv_nodes'].value_counts()

In [None]:
# Frequency of each distinct value in the 'node_caps' column
df['node_caps'].value_counts()

In [None]:
# Frequency of each distinct value in the 'deg_malig' column
df['deg_malig'].value_counts()

In [None]:
# Frequency of each distinct value in the 'breast' column
df['breast'].value_counts()

In [None]:
# Frequency of each distinct value in the 'breast_quad' column
df['breast_quad'].value_counts()

In [None]:
# Frequency of each distinct value in the 'irradiat' column
df['irradiat'].value_counts()

In [None]:
# Frequency of each distinct value in the target column 'class'
df['class'].value_counts()

# Categorical Encoding

In [None]:
# Every column except the target is used as a predictor
x_columns = [col for col in df.columns if col != 'class']

# Encode the categorical predictors as ordinal codes and the target as integer labels
feature_encoder = OrdinalEncoder()
label_encoder = LabelEncoder()
X = feature_encoder.fit_transform(df[x_columns].values)
y = label_encoder.fit_transform(df['class'].values)

# Train-Test Split

In [None]:
# Hold out 33% of the rows for testing; the seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=1,
)

for tag, features, target in (('Train:', X_train, y_train), ('Test:', X_test, y_test)):
    print(tag, features.shape, target.shape)

# Evaluation

In [None]:
def evaluate(model, x, y):
    """Print a confusion matrix and a classification report for ``model``.

    Args:
        model: Fitted estimator exposing ``predict``.
        x: Feature matrix passed to ``model.predict``.
        y: True labels the predictions are compared against.
    """
    predictions = model.predict(x)

    # Label the axes so rows read as true classes and columns as predictions
    matrix = pd.DataFrame(confusion_matrix(y, predictions)).rename_axis(
        index='True', columns='Pred',
    )
    report = classification_report(y, predictions, digits=5)

    # Blank line between the matrix and the report
    print(matrix, end='\n\n')
    print(report)

In [None]:
# Logistic regression trained on the full feature matrix, scored on the test split
lr_model = LogisticRegression().fit(X_train, y_train)
evaluate(lr_model, X_test, y_test)

# Feature Selection

### Chi-Square

- Pearson’s chi-squared statistical hypothesis test is an example of a test for independence between categorical variables.
- The results of this test can be used for feature selection, where those features that are independent of the target variable can be removed from the dataset.
- Reference:
  - https://machinelearningmastery.com/chi-squared-test-for-machine-learning/
  - https://www.statisticshowto.datasciencecentral.com/probability-and-statistics/chi-square/

In [None]:
# Score every feature against the target with the chi-squared statistic
fs = SelectKBest(score_func=chi2, k='all').fit(X_train, y_train)

tmp_df = pd.DataFrame({'feature': x_columns, 'score': fs.scores_})
fig = px.bar(tmp_df, x='feature', y='score')
fig.update_layout(height=300)
fig.show()

# Author's reading of the chart: inv_nodes has the highest score, followed by
# node_caps, irradiat, deg_malig, tumor_size, and breast_quad.
# For the experiment, keep the 6 highest-scoring features.
fs = SelectKBest(score_func=chi2, k=6).fit(X_train, y_train)
X_train_fs, X_test_fs = fs.transform(X_train), fs.transform(X_test)

X_train_fs.shape, X_test_fs.shape

In [None]:
# Logistic regression on the currently selected feature subset (X_train_fs / X_test_fs)
lr_model = LogisticRegression().fit(X_train_fs, y_train)
evaluate(lr_model, X_test_fs, y_test)

### Recursive Feature Elimination

- Repeatedly fits the estimator and removes the weakest feature(s) — `step` features per iteration — until the specified number of features is reached.

In [None]:
# Rank all features with RFE, eliminating one feature per iteration
lr_model = LogisticRegression()
rfe = RFE(estimator=lr_model, step=1).fit(X_train, y_train)

tmp_df = pd.DataFrame({'feature': x_columns, 'ranking': rfe.ranking_})
fig = px.bar(tmp_df, x='feature', y='ranking')
fig.update_layout(height=300)
fig.show()

# Author's reading of the chart: inv_nodes, deg_malig, breast, and irradiat
# are the most useful features because they are ranked 1.
# For the experiment, keep those 4 rank-1 features.
rfe = RFE(estimator=lr_model, n_features_to_select=4, step=1).fit(X_train, y_train)
X_train_fs, X_test_fs = rfe.transform(X_train), rfe.transform(X_test)

X_train_fs.shape, X_test_fs.shape

In [None]:
# Logistic regression on the currently selected feature subset (X_train_fs / X_test_fs)
lr_model = LogisticRegression().fit(X_train_fs, y_train)
evaluate(lr_model, X_test_fs, y_test)

### Recursive Feature Elimination with Cross Validation

In [None]:
# Cross-Validation
# Reference: https://www.youtube.com/watch?v=7062skdX05Y
# RFECV picks the number of features by cross-validated accuracy
# (2 stratified folds), eliminating one feature per iteration.
lr_model = LogisticRegression()
rfecv = RFECV(estimator=lr_model, step=1, scoring='accuracy', cv=StratifiedKFold(2))
rfecv = rfecv.fit(X_train, y_train)

tmp_df = pd.DataFrame({
    'feature': x_columns,
    'ranking': rfecv.ranking_
})
fig = px.bar(tmp_df, x='feature', y='ranking')
fig.update_layout(height=300)
fig.show()

# `grid_scores_` was deprecated in scikit-learn 1.0 and removed in 1.2; the mean
# cross-validated score per feature subset now lives in
# `cv_results_['mean_test_score']`. Fall back to the legacy attribute on
# releases that predate `cv_results_`.
if hasattr(rfecv, 'cv_results_'):
    mean_cv_scores = rfecv.cv_results_['mean_test_score']
else:
    mean_cv_scores = rfecv.grid_scores_

tmp_df = pd.DataFrame({
    # With step=1 and the default minimum of one feature, entry i is the
    # score obtained with i + 1 features.
    'n_feature': list(range(1, len(mean_cv_scores) + 1)),
    'grid_score': mean_cv_scores
})
fig = px.line(tmp_df, x='n_feature', y='grid_score')
fig.update_layout(height=300)
fig.show()

print(f'Optimal Number of features: {rfecv.n_features_}')

# Keep only the features RFECV selected
X_train_fs = rfecv.transform(X_train)
X_test_fs = rfecv.transform(X_test)

X_train_fs.shape, X_test_fs.shape

In [None]:
# Logistic regression on the currently selected feature subset (X_train_fs / X_test_fs)
lr_model = LogisticRegression().fit(X_train_fs, y_train)
evaluate(lr_model, X_test_fs, y_test)

### Extra Trees Classifier

In [None]:
# Seed the classifier so the importances, and therefore the selected
# features, are the same on every run (the train/test split is seeded too).
etc = ExtraTreesClassifier(random_state=1)
etc = etc.fit(X_train, y_train)

tmp_df = pd.DataFrame({
    'feature': x_columns,
    'importance': etc.feature_importances_
})

fig = px.bar(tmp_df, x='feature', y='importance')
fig.update_layout(height=300)
fig.show()


# tmp_df has a default 0..n-1 index, so the index labels of the 4 largest
# importances are also the column positions in X (most important first).
top_indexes = tmp_df['importance'].nlargest(4).index.to_numpy()
X_train_fs = X_train[:, top_indexes]
X_test_fs = X_test[:, top_indexes]

X_train_fs.shape, X_test_fs.shape

In [None]:
# Logistic regression on the currently selected feature subset (X_train_fs / X_test_fs)
lr_model = LogisticRegression().fit(X_train_fs, y_train)
evaluate(lr_model, X_test_fs, y_test)