In [1]:
import pandas as pd
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.feature_selection import RFE
from sklearn.preprocessing import StandardScaler

In [2]:
df = pd.read_csv("Breast Cancer Wisconsin Dataset.csv")
pd.options.display.max_columns = None
pd.options.display.max_colwidth = None

df.drop(columns=["id"], inplace=True)
df["diagnosis"].replace(["M","B"], [1, 0], inplace=True)
df["diagnosis"].unique()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["diagnosis"].replace(["M","B"], [1, 0], inplace=True)
  df["diagnosis"].replace(["M","B"], [1, 0], inplace=True)


array([1, 0])

In [3]:
df.columns

Index(['radius_mean', 'texture_mean', 'perimeter_mean', 'area_mean',
       'smoothness_mean', 'compactness_mean', 'concavity_mean',
       'concave points_mean', 'symmetry_mean', 'fractal_dimension_mean',
       'radius_se', 'texture_se', 'perimeter_se', 'area_se', 'smoothness_se',
       'compactness_se', 'concavity_se', 'concave points_se', 'symmetry_se',
       'fractal_dimension_se', 'radius_worst', 'texture_worst',
       'perimeter_worst', 'area_worst', 'smoothness_worst',
       'compactness_worst', 'concavity_worst', 'concave points_worst',
       'symmetry_worst', 'fractal_dimension_worst', 'diagnosis'],
      dtype='object')

In [4]:
df.head()

Unnamed: 0,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,fractal_dimension_mean,radius_se,texture_se,perimeter_se,area_se,smoothness_se,compactness_se,concavity_se,concave points_se,symmetry_se,fractal_dimension_se,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst,diagnosis
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,1.095,0.9053,8.589,153.4,0.006399,0.04904,0.05373,0.01587,0.03003,0.006193,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,1
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,0.5435,0.7339,3.398,74.08,0.005225,0.01308,0.0186,0.0134,0.01389,0.003532,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,1
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,0.7456,0.7869,4.585,94.03,0.00615,0.04006,0.03832,0.02058,0.0225,0.004571,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,1
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,0.4956,1.156,3.445,27.23,0.00911,0.07458,0.05661,0.01867,0.05963,0.009208,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,1
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,0.7572,0.7813,5.438,94.44,0.01149,0.02461,0.05688,0.01885,0.01756,0.005115,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,1


In [5]:
df["diagnosis"].value_counts()

diagnosis
0    357
1    212
Name: count, dtype: int64

In [6]:
mins = df.loc[:, "radius_mean" : "fractal_dimension_worst"].min()
maxs = df.loc[:, "radius_mean" : "fractal_dimension_worst"].max()

min_max_df = pd.DataFrame([mins, maxs])
min_max_df

Unnamed: 0,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,fractal_dimension_mean,radius_se,texture_se,perimeter_se,area_se,smoothness_se,compactness_se,concavity_se,concave points_se,symmetry_se,fractal_dimension_se,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,6.981,9.71,43.79,143.5,0.05263,0.01938,0.0,0.0,0.106,0.04996,0.1115,0.3602,0.757,6.802,0.001713,0.002252,0.0,0.0,0.007882,0.000895,7.93,12.02,50.41,185.2,0.07117,0.02729,0.0,0.0,0.1565,0.05504
1,28.11,39.28,188.5,2501.0,0.1634,0.3454,0.4268,0.2012,0.304,0.09744,2.873,4.885,21.98,542.2,0.03113,0.1354,0.396,0.05279,0.07895,0.02984,36.04,49.54,251.2,4254.0,0.2226,1.058,1.252,0.291,0.6638,0.2075


In [7]:
x = df.loc[:, "radius_mean" : "fractal_dimension_worst"]
y = df["diagnosis"]

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=.2, random_state=10)

In [8]:
def test_scores(model_name, predictions):
    
    accuracy = accuracy_score(y_test, predictions)
    confusion = confusion_matrix(y_test, predictions)
    precision = precision_score(y_test, predictions)
    recall = recall_score(y_test, predictions)
    f1 = f1_score(y_test, predictions)
    
    confusion_str = (f"True Negative: {confusion[0][0]}, "
                     f"True Positive: {confusion[1][1]}, "
                     f"False Positive: {confusion[0][1]}, "
                    f"False Negative: {confusion[1][0]}")
    
    return {
        "Model": model_name,
        "Accuracy": accuracy,
        "Precision": precision,
        "Recall": recall,
        "F1": f1,
        "Confusion Matrix": confusion_str
    }

In [9]:
svcPipe = Pipeline([
    ("scaler", StandardScaler()),
    ('select', SelectKBest(score_func=f_classif)),
    ('svc', SVC(class_weight="balanced"))
])

cv_SVC = GridSearchCV(svcPipe, {
    "select__k": range(10,31),
    "svc__C": [0.01, 0.1, 1, 10],
    "svc__kernel": ["linear", "rbf"]
    }, cv=10,
    scoring=['accuracy', 'precision', 'recall', 'f1'],
    refit="recall",
    n_jobs=-1)

cv_SVC.fit(x_train, y_train)

In [10]:
svc_results_df = pd.DataFrame(cv_SVC.cv_results_)

selected_cols = [
    "param_select__k", "param_svc__C", "param_svc__kernel",
    "mean_test_accuracy", "mean_test_precision", "mean_test_recall", "mean_test_f1",
    "std_test_accuracy", "std_test_precision", "std_test_recall", "std_test_f1",
    "rank_test_accuracy", "rank_test_precision", "rank_test_recall", "rank_test_f1"
]

cv_metrics = ["mean_test_accuracy", "mean_test_precision", "mean_test_recall", "mean_test_f1", "rank_test_recall"]

svc_cv_results_df = svc_results_df[cv_metrics].sort_values("rank_test_recall").head(1).drop(columns="rank_test_recall")
svc_results_df[selected_cols].sort_values("rank_test_recall").head(10)

Unnamed: 0,param_select__k,param_svc__C,param_svc__kernel,mean_test_accuracy,mean_test_precision,mean_test_recall,mean_test_f1,std_test_accuracy,std_test_precision,std_test_recall,std_test_f1,rank_test_accuracy,rank_test_precision,rank_test_recall,rank_test_f1
125,25,1.0,rbf,0.978019,0.972113,0.970588,0.970872,0.024081,0.037437,0.03946,0.032287,8,37,1,8
134,26,10.0,linear,0.973575,0.960313,0.970588,0.965154,0.029446,0.037104,0.047425,0.039515,25,65,1,22
166,30,10.0,linear,0.96256,0.934101,0.970588,0.951531,0.033003,0.048041,0.047425,0.043153,77,105,1,76
84,20,1.0,linear,0.980193,0.982604,0.965033,0.973406,0.020844,0.026666,0.038834,0.028347,1,8,4,1
117,24,1.0,rbf,0.975797,0.972113,0.964706,0.967652,0.025009,0.037437,0.047059,0.033813,14,37,5,14
100,22,1.0,linear,0.980145,0.982604,0.964706,0.973044,0.023096,0.026666,0.047059,0.031644,2,8,5,2
92,21,1.0,linear,0.980145,0.982604,0.964706,0.973044,0.023096,0.026666,0.047059,0.031644,2,8,5,2
116,24,1.0,linear,0.977971,0.977341,0.964706,0.970341,0.022128,0.027879,0.047059,0.030354,9,25,5,9
142,27,10.0,linear,0.96913,0.955168,0.964706,0.959202,0.033261,0.042496,0.059988,0.045323,53,77,5,46
167,30,10.0,rbf,0.9757,0.970799,0.964706,0.967346,0.025169,0.029329,0.047059,0.034338,17,48,5,15


In [11]:
svc_results_df["mean_test_precision"].count()

np.int64(168)

In [12]:
svc_results_df["mean_test_precision"].max()

np.float64(0.9888544891640866)

In [13]:
svc = cv_SVC.best_estimator_

svc_selected_features = svc.named_steps["select"].get_support()
x_train.columns[svc_selected_features]

Index(['radius_mean', 'texture_mean', 'perimeter_mean', 'area_mean',
       'smoothness_mean', 'compactness_mean', 'concavity_mean',
       'concave points_mean', 'symmetry_mean', 'radius_se', 'perimeter_se',
       'area_se', 'compactness_se', 'concavity_se', 'concave points_se',
       'radius_worst', 'texture_worst', 'perimeter_worst', 'area_worst',
       'smoothness_worst', 'compactness_worst', 'concavity_worst',
       'concave points_worst', 'symmetry_worst', 'fractal_dimension_worst'],
      dtype='object')

In [14]:
x_train.columns[svc_selected_features == False]

Index(['fractal_dimension_mean', 'texture_se', 'smoothness_se', 'symmetry_se',
       'fractal_dimension_se'],
      dtype='object')

In [15]:
svc_predictions = svc.predict(x_test)

svc_scores = test_scores("SVC", svc_predictions)
svc_scores

{'Model': 'SVC',
 'Accuracy': 0.9736842105263158,
 'Precision': 0.9285714285714286,
 'Recall': 1.0,
 'F1': 0.9629629629629629,
 'Confusion Matrix': 'True Negative: 72, True Positive: 39, False Positive: 3, False Negative: 0'}

In [16]:
lgPipe = Pipeline([
    ("scale", StandardScaler()),
    ("select", SelectKBest(f_classif)),
    ("lg", LogisticRegression(max_iter=1000, class_weight="balanced"))
                   ])

cv_LG = GridSearchCV(lgPipe, {
    "select__k": range(10,31),
    "lg__C": [0.01, 0.1, 1, 10, 100]
    }, cv=10
    , scoring=['accuracy', 'precision', 'recall', 'f1'],
    refit="f1",
    n_jobs=-1
    )

cv_LG.fit(x_train, y_train)

In [19]:
x_train.columns[lg_selected_features == False]

Index([], dtype='object')

In [20]:
lg_predictions = lg.predict(x_test)
lg_scores = test_scores("Logistic Regression", lg_predictions)

lg_scores

{'Model': 'Logistic Regression',
 'Accuracy': 0.956140350877193,
 'Precision': 0.925,
 'Recall': 0.9487179487179487,
 'F1': 0.9367088607594937,
 'Confusion Matrix': 'True Negative: 72, True Positive: 37, False Positive: 3, False Negative: 2'}

In [21]:
rf = RandomForestClassifier(criterion="entropy", class_weight="balanced", random_state=10)

rfPipe = Pipeline([
    ("select", RFE(rf)),
    ("rf", rf)
])

cv_RF = GridSearchCV(rfPipe, {
    "select__n_features_to_select": range(10,31),
    "rf__n_estimators": [100, 200, 300, 400]
    }, cv=10
    , scoring=['accuracy', 'precision', 'recall', 'f1'],
    refit="recall",
    n_jobs=-1
        )

cv_RF.fit(x_train, y_train)

In [22]:
rf_results_df = pd.DataFrame(cv_RF.cv_results_)
selected_cols = [
    "param_select__n_features_to_select", "param_rf__n_estimators",
    "mean_test_accuracy", "mean_test_precision", "mean_test_recall", "mean_test_f1",
    "std_test_accuracy", "std_test_precision", "std_test_recall", "std_test_f1",
    "rank_test_accuracy", "rank_test_precision", "rank_test_recall", "rank_test_f1"
]

rf_cv_results_df = rf_results_df[cv_metrics].sort_values("rank_test_recall").head(1).drop(columns="rank_test_recall")
rf_results_df[selected_cols].sort_values("rank_test_recall").head(10)

Unnamed: 0,param_select__n_features_to_select,param_rf__n_estimators,mean_test_accuracy,mean_test_precision,mean_test_recall,mean_test_f1,std_test_accuracy,std_test_precision,std_test_recall,std_test_f1,rank_test_accuracy,rank_test_precision,rank_test_recall,rank_test_f1
9,19,100,0.964734,0.965033,0.94183,0.953085,0.022647,0.038834,0.026339,0.02978,1,29,1,1
6,16,100,0.964638,0.965325,0.941503,0.952718,0.03023,0.046785,0.045575,0.040094,4,25,2,3
4,14,100,0.962512,0.964665,0.935948,0.949687,0.026398,0.039075,0.04129,0.035269,7,48,3,12
25,14,200,0.962512,0.964665,0.935948,0.949687,0.026398,0.039075,0.04129,0.035269,7,48,3,12
29,18,200,0.962464,0.965033,0.935948,0.949865,0.029953,0.046905,0.04129,0.039563,17,29,3,9
30,19,200,0.962512,0.964665,0.935948,0.949876,0.024456,0.039075,0.031825,0.032495,14,48,3,8
26,15,200,0.96029,0.95915,0.935948,0.947013,0.029513,0.045827,0.04129,0.039055,37,63,3,36
27,16,200,0.962512,0.964665,0.935948,0.949687,0.026398,0.039075,0.04129,0.035269,7,48,3,12
24,13,200,0.96029,0.95915,0.935948,0.947013,0.029513,0.045827,0.04129,0.039055,37,63,3,36
61,29,300,0.962464,0.965686,0.935948,0.950028,0.024512,0.045781,0.031825,0.032,17,24,3,7


In [23]:
rf = cv_RF.best_estimator_

rf_selected_features = rf.named_steps["select"].get_support()
x_train.columns[rf_selected_features]

Index(['radius_mean', 'texture_mean', 'perimeter_mean', 'area_mean',
       'compactness_mean', 'concavity_mean', 'concave points_mean',
       'radius_se', 'perimeter_se', 'area_se', 'radius_worst', 'texture_worst',
       'perimeter_worst', 'area_worst', 'smoothness_worst', 'concavity_worst',
       'concave points_worst', 'symmetry_worst', 'fractal_dimension_worst'],
      dtype='object')

In [24]:
x_train.columns[rf_selected_features == False]

Index(['smoothness_mean', 'symmetry_mean', 'fractal_dimension_mean',
       'texture_se', 'smoothness_se', 'compactness_se', 'concavity_se',
       'concave points_se', 'symmetry_se', 'fractal_dimension_se',
       'compactness_worst'],
      dtype='object')

In [25]:
rf.fit(x_train, y_train)

rf_predictions = rf.predict(x_test)
rf_scores = test_scores("Random Forest", rf_predictions)

rf_scores

{'Model': 'Random Forest',
 'Accuracy': 0.9736842105263158,
 'Precision': 0.95,
 'Recall': 0.9743589743589743,
 'F1': 0.9620253164556962,
 'Confusion Matrix': 'True Negative: 73, True Positive: 38, False Positive: 2, False Negative: 1'}

In [26]:
rf2 = RandomForestClassifier(criterion="entropy", class_weight="balanced", random_state=10)

rf2Pipe = Pipeline([
    ("rf", rf2)
])

cv_RF2 = GridSearchCV(rf2Pipe, {
    "rf__n_estimators": [100, 200, 300, 400]
    }, cv=10
    , scoring=['accuracy', 'precision', 'recall', 'f1'],
    refit="recall",
    n_jobs=-1
        )

cv_RF2.fit(x_train, y_train)

In [27]:
rf2_results_df = pd.DataFrame(cv_RF2.cv_results_)
selected_cols = [
    "param_rf__n_estimators",
    "mean_test_accuracy", "mean_test_precision", "mean_test_recall", "mean_test_f1",
    "std_test_accuracy", "std_test_precision", "std_test_recall", "std_test_f1",
    "rank_test_accuracy", "rank_test_precision", "rank_test_recall", "rank_test_f1"
]

rf2_cv_results_df = rf2_results_df[cv_metrics].sort_values("rank_test_recall").head(1).drop(columns="rank_test_recall")
rf2_results_df[selected_cols].sort_values("rank_test_recall").head(10)

Unnamed: 0,param_rf__n_estimators,mean_test_accuracy,mean_test_precision,mean_test_recall,mean_test_f1,std_test_accuracy,std_test_precision,std_test_recall,std_test_f1,rank_test_accuracy,rank_test_precision,rank_test_recall,rank_test_f1
1,200,0.960242,0.965033,0.930065,0.946835,0.027835,0.046905,0.035535,0.036661,1,2,1,1
2,300,0.960242,0.965033,0.930065,0.946835,0.027835,0.046905,0.035535,0.036661,1,2,1,1
3,400,0.960242,0.965033,0.930065,0.946835,0.027835,0.046905,0.035535,0.036661,1,2,1,1
0,100,0.960242,0.965325,0.929739,0.946468,0.027835,0.046785,0.044118,0.036987,1,1,4,4


In [28]:
rf2 = cv_RF2.best_estimator_
rf2.fit(x_train, y_train)

rf2_predictions = rf2.predict(x_test)
rf2_scores = test_scores("Random Forest Without Feature Selection", rf2_predictions)

rf2_scores

{'Model': 'Random Forest Without Feature Selection',
 'Accuracy': 0.9736842105263158,
 'Precision': 0.95,
 'Recall': 0.9743589743589743,
 'F1': 0.9620253164556962,
 'Confusion Matrix': 'True Negative: 73, True Positive: 38, False Positive: 2, False Negative: 1'}

In [29]:
combined_scores = [svc_scores, lg_scores, rf_scores, rf2_scores]
comparative_df = pd.DataFrame(combined_scores)

comparative_df.sort_values(["Recall", "Accuracy", "Precision", "F1"], ascending=False)

Unnamed: 0,Model,Accuracy,Precision,Recall,F1,Confusion Matrix
0,SVC,0.973684,0.928571,1.0,0.962963,"True Negative: 72, True Positive: 39, False Positive: 3, False Negative: 0"
2,Random Forest,0.973684,0.95,0.974359,0.962025,"True Negative: 73, True Positive: 38, False Positive: 2, False Negative: 1"
3,Random Forest Without Feature Selection,0.973684,0.95,0.974359,0.962025,"True Negative: 73, True Positive: 38, False Positive: 2, False Negative: 1"
1,Logistic Regression,0.95614,0.925,0.948718,0.936709,"True Negative: 72, True Positive: 37, False Positive: 3, False Negative: 2"


In [30]:
compare_cv_result_df = pd.concat([svc_cv_results_df, lg_cv_results_df, rf_cv_results_df, rf2_cv_results_df],
                                 keys=["SVC", "Logistic Regression", "Random Forest", "Random Forest Without Feature Selection"])\
                                .sort_values(["mean_test_recall", "mean_test_accuracy", "mean_test_f1"], ascending=False)

compare_cv_result_df.index = compare_cv_result_df.index.get_level_values(0)
compare_cv_result_df.sort_values(["mean_test_recall", "mean_test_accuracy", "mean_test_precision", "mean_test_recall"], ascending=False)

Unnamed: 0,mean_test_accuracy,mean_test_precision,mean_test_recall,mean_test_f1
SVC,0.978019,0.972113,0.970588,0.970872
Logistic Regression,0.980145,0.982604,0.964706,0.973044
Random Forest,0.964734,0.965033,0.94183,0.953085
Random Forest Without Feature Selection,0.960242,0.965033,0.930065,0.946835
