In [8]:
# Import packages and functions
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.feature_selection import chi2
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import SelectKBest

In [9]:
# Import dataset
cancer = pd.read_csv('../data/raw/breast.csv')

# Changing the names of the columns to improve their understanding
new_columns = ['ID', 'Diagnosis']
for i in range (30):       
       if cancer.columns[i+2][-1:] == '1': 
               column_type = 'mean'
       elif cancer.columns[i+2][-1:] == '2': 
               column_type = 'se'
       else: 
               column_type = 'worst'
       temp = cancer.columns[i+2][:-1] + '_' + column_type
       new_columns.append(temp)

cancer.columns = new_columns
cancer.describe()

Unnamed: 0,ID,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave_points_mean,symmetry_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave_points_worst,symmetry_worst,fractal_dimension_worst
count,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,...,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0
mean,30371830.0,14.127292,19.289649,91.969033,654.889104,0.09636,0.104341,0.088799,0.048919,0.181162,...,16.26919,25.677223,107.261213,880.583128,0.132369,0.254265,0.272188,0.114606,0.290076,0.083946
std,125020600.0,3.524049,4.301036,24.298981,351.914129,0.014064,0.052813,0.07972,0.038803,0.027414,...,4.833242,6.146258,33.602542,569.356993,0.022832,0.157336,0.208624,0.065732,0.061867,0.018061
min,8670.0,6.981,9.71,43.79,143.5,0.05263,0.01938,0.0,0.0,0.106,...,7.93,12.02,50.41,185.2,0.07117,0.02729,0.0,0.0,0.1565,0.05504
25%,869218.0,11.7,16.17,75.17,420.3,0.08637,0.06492,0.02956,0.02031,0.1619,...,13.01,21.08,84.11,515.3,0.1166,0.1472,0.1145,0.06493,0.2504,0.07146
50%,906024.0,13.37,18.84,86.24,551.1,0.09587,0.09263,0.06154,0.0335,0.1792,...,14.97,25.41,97.66,686.5,0.1313,0.2119,0.2267,0.09993,0.2822,0.08004
75%,8813129.0,15.78,21.8,104.1,782.7,0.1053,0.1304,0.1307,0.074,0.1957,...,18.79,29.72,125.4,1084.0,0.146,0.3391,0.3829,0.1614,0.3179,0.09208
max,911320500.0,28.11,39.28,188.5,2501.0,0.1634,0.3454,0.4268,0.2012,0.304,...,36.04,49.54,251.2,4254.0,0.2226,1.058,1.252,0.291,0.6638,0.2075


In [10]:
# Standardize features using the z-score method
scaler = StandardScaler()

features = cancer.columns[2:]

standardized_cancer = scaler.fit_transform(cancer[features])

# Convert the standardized data back to a DataFrame
standardized_cancer = pd.DataFrame(standardized_cancer, columns=features)

standardized_cancer.describe()

Unnamed: 0,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave_points_mean,symmetry_mean,fractal_dimension_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave_points_worst,symmetry_worst,fractal_dimension_worst
count,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,...,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0
mean,-1.373633e-16,6.868164e-17,-1.248757e-16,-2.185325e-16,-8.366672e-16,1.873136e-16,4.995028e-17,-4.995028e-17,1.74826e-16,4.745277e-16,...,-8.241796e-16,1.248757e-17,-3.746271e-16,0.0,-2.372638e-16,-3.371644e-16,7.492542e-17,2.247763e-16,2.62239e-16,-5.744282e-16
std,1.00088,1.00088,1.00088,1.00088,1.00088,1.00088,1.00088,1.00088,1.00088,1.00088,...,1.00088,1.00088,1.00088,1.00088,1.00088,1.00088,1.00088,1.00088,1.00088,1.00088
min,-2.029648,-2.229249,-1.984504,-1.454443,-3.112085,-1.610136,-1.114873,-1.26182,-2.744117,-1.819865,...,-1.726901,-2.223994,-1.693361,-1.222423,-2.682695,-1.443878,-1.305831,-1.745063,-2.16096,-1.601839
25%,-0.6893853,-0.7259631,-0.6919555,-0.6671955,-0.7109628,-0.747086,-0.7437479,-0.7379438,-0.7032397,-0.7226392,...,-0.6749213,-0.7486293,-0.6895783,-0.642136,-0.6912304,-0.6810833,-0.7565142,-0.7563999,-0.6418637,-0.6919118
50%,-0.2150816,-0.1046362,-0.23598,-0.2951869,-0.03489108,-0.2219405,-0.3422399,-0.3977212,-0.0716265,-0.1782793,...,-0.2690395,-0.04351564,-0.2859802,-0.341181,-0.04684277,-0.2695009,-0.2182321,-0.2234689,-0.1274095,-0.2164441
75%,0.4693926,0.5841756,0.4996769,0.3635073,0.636199,0.4938569,0.5260619,0.6469351,0.5307792,0.4709834,...,0.5220158,0.6583411,0.540279,0.357589,0.5975448,0.5396688,0.5311411,0.71251,0.4501382,0.4507624
max,3.971288,4.651889,3.97613,5.250529,4.770911,4.568425,4.243589,3.92793,4.484751,4.910919,...,4.094189,3.885905,4.287337,5.930172,3.955374,5.112877,4.700669,2.685877,6.046041,6.846856


In [11]:
# Split the dataset into test and training sets at 25-75 split.

X_train, X_test, Y_train, Y_test = train_test_split(standardized_cancer, 
                                                    cancer['Diagnosis'], 
                                                    test_size=0.25, random_state=42)

# Verify shapes
print(X_train.shape, X_test.shape)
print(Y_train.shape, Y_test.shape)

(426, 30) (143, 30)
(426,) (143,)


In [18]:
# As part of our analysis we will need a metric to decide which feature to select in case other metrics (e.g. accuracy produces equal results)
# Therefore, we suggest to use Chi-square test to select features based on their statistical significance relative to the target variable

le = LabelEncoder()
# Apply Label Encoding to each column in your dataset if necessary
for column in standardized_cancer.columns:
    standardized_cancer[column] = le.fit_transform(standardized_cancer[column])

# Assuming 'X' contains your features and 'y' is your target variable
chi2_selector = SelectKBest(chi2, k='all')  # k='all' to compute scores for all features
chi2_selector.fit(standardized_cancer, cancer['Diagnosis'])

# Get Chi-square scores and p-values
chi2_scores = chi2_selector.scores_
p_values = chi2_selector.pvalues_

# Creating a DataFrame to summarize scores and p-values
chi2_results = pd.DataFrame({"feature": standardized_cancer.columns, "Chi2_Score": chi2_scores, "P_Value": p_values})
# Sort features by score or p-value
# significant_features = chi2_results[chi2_results["P_Value"] < 0.05].sort_values(by="Chi2_Score", ascending=False)

# print(significant_features)
chi2_results['chi2_rank'] = chi2_results['Chi2_Score'].rank(ascending=False)
chi2_results = chi2_results.sort_values(by='chi2_rank', ascending=True)
chi2_results

Unnamed: 0,feature,Chi2_Score,P_Value,chi2_rank
7,concave_points_mean,33181.619418,0.0,1.0
23,area_worst,31608.293011,0.0,2.0
27,concave_points_worst,30176.634384,0.0,3.0
6,concavity_mean,28964.980562,0.0,4.0
22,perimeter_worst,28744.082662,0.0,5.0
2,perimeter_mean,27327.235675,0.0,6.0
3,area_mean,27274.92738,0.0,7.0
26,concavity_worst,26496.098263,0.0,8.0
13,area_se,26300.763509,0.0,9.0
20,radius_worst,25680.903722,0.0,10.0


In [21]:
# 1) We suggest to follow the Forward Feature Selection method to make the decision on the most important features.
# 2) The general idea is to iteratively add features based on the KNN method performance (we'll be optimizing accuracy).
# 3) At the first step we'll run independently KNN for every feature to decide which 
# one of them produces KNN with the highest accuracy score (such feature will be selected as feature1).
# 4) At the second step we'll run independently KNN for feature1 and every other feature to decide which 
# combination of feature1 and another feature produces KNN with the highest accuracy score (this another feature is feature2).
# 5) We'll be doing this up to 15 features. We think it should be enough for our purpose 
# to limit the number of features at least two times as compared to the initial 30 features.
knn = KNeighborsClassifier()
parameter_grid = {
    "n_neighbors": range(1, 30),
}

cancer_tune_grid = GridSearchCV(
    estimator=knn,
    param_grid=parameter_grid,
    cv=10
)

# structure of the final DataFrame with the results. The results presented in every line will be for all features above
KNN_forward_results = pd.DataFrame(columns=
                                           ['feature',
                                            'opt_n_neighbors', 
                                            'accuracy', 
                                            'precision', 
                                            'recall', 
                                            'f1']
                                           )

KNN_interim_results = pd.DataFrame(columns=
                                           ['feature', 
                                            'opt_n_neighbors', 
                                            'accuracy', 
                                            'precision', 
                                            'recall', 
                                            'f1']
                                           )

stage = 1
feature_count = 0
while feature_count < 15: 
    
    if stage == 1:
        working_features = features	# list of features to used for identifing the feature that helps to produce the highest accuracy score
    else:
        working_features = pd.Index(list(set(working_features) - set(selected_features['feature'])))
        X_train_base = X_train[KNN_forward_results['feature'].tolist()] # "base" dataset to train the models (it will be iteratively expanded, see below)
        X_test_base = X_test[KNN_forward_results['feature'].tolist()] # "base" dataset to test the models (it will be iteratively expanded, see below)
        KNN_interim_results = KNN_interim_results.drop(KNN_interim_results.index)
        selected_features = selected_features.drop(selected_features.index)    

    for i in working_features:
        if stage == 1:
            X_train_exp = X_train[[i]]
            X_test_exp = X_test[[i]]
        else:
            X_train_exp = X_train_base.join(X_train[i], how='inner')
            X_test_exp = X_test_base.join(X_test[i], how='inner')

        cancer_tune_grid.fit(
            X_train_exp,
            Y_train
        )

        # Refit the KNN model with the best k value
        knn_best = KNeighborsClassifier(n_neighbors=cancer_tune_grid.best_params_['n_neighbors'])
        knn_best.fit(X_train_exp, Y_train)

        # Predict on the test set
        
        X_pred = X_test_exp
        Y_pred = knn_best.predict(X_pred)

        new_row = pd.DataFrame({
            'feature': [i],
            'opt_n_neighbors': [cancer_tune_grid.best_params_['n_neighbors']],
            'accuracy': [accuracy_score(Y_test, Y_pred)],
            'precision': [precision_score(Y_test,Y_pred,pos_label="M")],
            'recall': [recall_score(Y_test,Y_pred,pos_label="M")],
            'f1': [f1_score(Y_test,Y_pred, average='weighted')] 
        })
        if KNN_interim_results.empty: 
            KNN_interim_results = new_row
        else:
            KNN_interim_results = pd.concat([KNN_interim_results, new_row], ignore_index=True)
    
    KNN_interim_results['accuracy_rank'] = KNN_interim_results['accuracy'].rank(ascending=False)

    KNN_interim_results = pd.merge(KNN_interim_results, chi2_results[['feature','Chi2_Score']], on='feature', how='inner')
    
    interim_selection = KNN_interim_results[KNN_interim_results['accuracy_rank'] == min(KNN_interim_results['accuracy_rank'])]
    selected_features = interim_selection[interim_selection['Chi2_Score'] == max(interim_selection['Chi2_Score'])]    
    
    for j in range(len(selected_features)): # in case there could be 2 or more lines with the same accuracy scores and Chi2 scores
        new_row = pd.DataFrame({
            'feature': [selected_features.iloc[j]['feature']],
            'opt_n_neighbors': [selected_features.iloc[j]['opt_n_neighbors']],
            'accuracy': [selected_features.iloc[j]['accuracy']],
            'precision': [selected_features.iloc[j]['precision']],
            'recall': [selected_features.iloc[j]['recall']],
            'f1': [selected_features.iloc[j]['f1']]
        })
        if KNN_forward_results.empty: 
            KNN_forward_results = new_row
        else:
            KNN_forward_results = pd.concat([KNN_forward_results, new_row], ignore_index=True)
    stage += 1
    feature_count = KNN_forward_results.shape[0]

KNN_forward_results = pd.merge(KNN_forward_results, chi2_results[['feature','Chi2_Score']], on='feature', how='inner')
KNN_forward_results

Unnamed: 0,feature,opt_n_neighbors,accuracy,precision,recall,f1,Chi2_Score
0,area_worst,16,0.951049,0.979592,0.888889,0.95053,31608.293011
1,compactness_se,17,0.979021,1.0,0.944444,0.978895,7301.703489
2,concavity_mean,5,0.986014,0.981481,0.981481,0.986014,28964.980562
3,perimeter_worst,17,0.986014,0.981481,0.981481,0.986014,28744.082662
4,area_se,13,0.986014,0.981481,0.981481,0.986014,26300.763509
5,compactness_worst,9,0.993007,0.981818,1.0,0.993019,18250.856307
6,concave_points_se,4,0.986014,0.981481,0.981481,0.986014,11447.8856
7,smoothness_worst,8,0.993007,1.0,0.981481,0.992994,6043.389537
8,area_mean,5,0.993007,0.981818,1.0,0.993019,27274.92738
9,radius_worst,5,0.993007,0.981818,1.0,0.993019,25680.903722


In [27]:
# Based on the results in the table above, adding any features 
# beyond the first nine does not improve any of the provided metrics, including the accuracy score

knn = KNeighborsClassifier()
parameter_grid = {
    "n_neighbors": range(1, 100), #attention!
}

cancer_tune_grid = GridSearchCV(
    estimator=knn,
    param_grid=parameter_grid,
    cv=10 #attention!
)

parementer_list = KNN_forward_results['feature'][:9].tolist()

cancer_tune_grid.fit(
    X_train[parementer_list],
    Y_train
)

knn_best = KNeighborsClassifier(n_neighbors=cancer_tune_grid.best_params_['n_neighbors'])
knn_best.fit(X_train[parementer_list], Y_train)

# Predict on the test set
X_pred = X_test[parementer_list]
Y_pred = knn_best.predict(X_pred)

print(parementer_list)
print(f"Number of N neighbors in KNN is {cancer_tune_grid.best_params_['n_neighbors']}")
print(f"Accuracy score is {accuracy_score(Y_test, Y_pred)}")
print(f"Precision score is {precision_score(Y_test,Y_pred,pos_label='M')}")
print(f"Recall score is {recall_score(Y_test,Y_pred,pos_label='M')}")
print(f"F1 score is {f1_score(Y_test,Y_pred, average='weighted')}") 


['area_worst', 'compactness_se', 'concavity_mean', 'perimeter_worst', 'area_se', 'compactness_worst', 'concave_points_se', 'smoothness_worst', 'area_mean']
Number of N neighbors in KNN is 5
Accuracy score is 0.993006993006993
Precision score is 0.9818181818181818
Recall score is 1.0
F1 score is 0.9930193167636802
