In [33]:
import pandas as pd
import numpy as np
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)
pd.options.display.float_format = '{:,.2f}'.format

In [34]:
base = pd.read_csv('data.csv', index_col = 'Unnamed: 32')
df = base

### There are 30 features in DataFrame about Cancers, showing the record of patients that had Benign and Malignant. But, do we actually need all of the 30 features?
The purpose of this notebook is to see "which feature" is important aka has a strong regression relationship to a feature to determine whether the tumor is "Benign" or "Malignant" by using Stepwise regression

Data source: https://www.kaggle.com/datasets/reihanenamdari/breast-cancer

In [35]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 569 entries, nan to nan
Data columns (total 32 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   id                       569 non-null    int64  
 1   diagnosis                569 non-null    object 
 2   radius_mean              569 non-null    float64
 3   texture_mean             569 non-null    float64
 4   perimeter_mean           569 non-null    float64
 5   area_mean                569 non-null    float64
 6   smoothness_mean          569 non-null    float64
 7   compactness_mean         569 non-null    float64
 8   concavity_mean           569 non-null    float64
 9   concave points_mean      569 non-null    float64
 10  symmetry_mean            569 non-null    float64
 11  fractal_dimension_mean   569 non-null    float64
 12  radius_se                569 non-null    float64
 13  texture_se               569 non-null    float64
 14  perimeter_se             569 

### convert string into numeric

In [36]:
df.head()

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,fractal_dimension_mean,radius_se,texture_se,perimeter_se,area_se,smoothness_se,compactness_se,concavity_se,concave points_se,symmetry_se,fractal_dimension_se,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
,842302,M,17.99,10.38,122.8,1001.0,0.12,0.28,0.3,0.15,0.24,0.08,1.09,0.91,8.59,153.4,0.01,0.05,0.05,0.02,0.03,0.01,25.38,17.33,184.6,2019.0,0.16,0.67,0.71,0.27,0.46,0.12
,842517,M,20.57,17.77,132.9,1326.0,0.08,0.08,0.09,0.07,0.18,0.06,0.54,0.73,3.4,74.08,0.01,0.01,0.02,0.01,0.01,0.0,24.99,23.41,158.8,1956.0,0.12,0.19,0.24,0.19,0.28,0.09
,84300903,M,19.69,21.25,130.0,1203.0,0.11,0.16,0.2,0.13,0.21,0.06,0.75,0.79,4.58,94.03,0.01,0.04,0.04,0.02,0.02,0.0,23.57,25.53,152.5,1709.0,0.14,0.42,0.45,0.24,0.36,0.09
,84348301,M,11.42,20.38,77.58,386.1,0.14,0.28,0.24,0.11,0.26,0.1,0.5,1.16,3.44,27.23,0.01,0.07,0.06,0.02,0.06,0.01,14.91,26.5,98.87,567.7,0.21,0.87,0.69,0.26,0.66,0.17
,84358402,M,20.29,14.34,135.1,1297.0,0.1,0.13,0.2,0.1,0.18,0.06,0.76,0.78,5.44,94.44,0.01,0.02,0.06,0.02,0.02,0.01,22.54,16.67,152.2,1575.0,0.14,0.2,0.4,0.16,0.24,0.08


In [37]:
uniques = df['diagnosis'].unique()
value_to_numeric = {string: i+1 for i, string in enumerate(uniques)}
df['diagnosis'] = df['diagnosis'].map(value_to_numeric)

In [38]:
exclude_columns = ['id', 'diagnosis']

distinct_columns = set(column for column in df.columns if column not in exclude_columns)

x_column = list(distinct_columns)
y = df['diagnosis']

### bi directional stepwise regression
I'm kind of person who loves analogy, let's imagine you're preparing for your trip:
1. You start with all items that you might need for your trip
2. Stepwise forward selection = you decide to add items that you think might be useful for your trip(sunscreen, smartphone, charger, powerbank, swimsuit, extra shoes, etc)
3. Stepwise backward selection = After packing some items, you realize that you may overpacked, then you decided to remove some things, for example you reconsider to bring extra shoes
4. the output = in the end, you only bring the most important and useful items for the trip

- forward is a process to add feature 1 by 1 based on the significant level(we can use the low p-value / p-value < 0.05)
- backward is a process to make sure that the features are actually imporant by taking only lowest p-value. Basically it removes less essential features to prevent overfitting

Overall, I used p-value to determine which feature is important

In [87]:
import pandas as pd
import statsmodels.api as sm

def stepwise_selection(data, target, SL_in=0.05, SL_out=0.05):
    initial_features = data.columns.tolist()
    best_features = []

    while len(initial_features) > 0:
        remaining_features = list(set(initial_features) - set(best_features))
        new_pval = pd.Series(index=remaining_features)

        for new_column in remaining_features:
            model = sm.OLS(target, sm.add_constant(data[best_features + [new_column]])).fit()
            new_pval[new_column] = model.pvalues[new_column]

        min_p_value = new_pval.min()

        if min_p_value < SL_in:
            best_feature = new_pval.idxmin()

            # Forward selection
            best_features.append(best_feature)

            # Backward elimination
            while len(best_features) > 0:
                best_features_with_constant = sm.add_constant(data[best_features])
                model_backward = sm.OLS(target, best_features_with_constant).fit()
                p_values = model_backward.pvalues[1:]
                max_p_value = p_values.max()

                if max_p_value >= SL_out:
                    excluded_feature = p_values.idxmax()
                    best_features.remove(excluded_feature)
                else:
                    break
        else:
            break

    # Final model
    final_model = sm.OLS(target, sm.add_constant(data[best_features])).fit()

    return final_model

result_model = stepwise_selection(X, y)
print(result_model.summary())


                            OLS Regression Results                            
Dep. Variable:              diagnosis   R-squared:                       0.766
Model:                            OLS   Adj. R-squared:                  0.761
Method:                 Least Squares   F-statistic:                     165.5
Date:                Wed, 15 Nov 2023   Prob (F-statistic):          1.80e-167
Time:                        09:37:33   Log-Likelihood:                 19.001
No. Observations:                 569   AIC:                            -14.00
Df Residuals:                     557   BIC:                             38.12
Df Model:                          11                                         
Covariance Type:            nonrobust                                         
                              coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------
const                     

Other than p-value, there are some parameters that are important to be considered within the OLS summary:
- Adj R.squared        : you've chosen items that covers 76.1% of your need
- F-statistics         : 165.5 is a high score, showing that you're not packing randomly
- AIC                  : the lower the better, the packed items are meeting your needs and you've avoid unnecessary stuff
- BIC                  : 38.12 its good, because the packing not only effective but also simple and straightforward

Final result: you've chosen the items that you needed the most

Reference:
- https://maniksonituts.medium.com/how-to-build-a-machine-learning-model-715d7ecb3d02
- https://medium.com/@abhishek.km23/methods-of-feature-selection-3b4c88f0e2d5