In [1]:
import pandas as pd
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

In [2]:
df = pd.read_csv("Breast Cancer Wisconsin Dataset.csv", )

In [3]:
df.drop(columns=["id"], inplace=True)

df["diagnosis"].replace(["M","B"], [1, 0], inplace=True)
df["diagnosis"].unique()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["diagnosis"].replace(["M","B"], [1, 0], inplace=True)
  df["diagnosis"].replace(["M","B"], [1, 0], inplace=True)


array([1, 0])

In [4]:
df.columns

Index(['radius_mean', 'texture_mean', 'perimeter_mean', 'area_mean',
       'smoothness_mean', 'compactness_mean', 'concavity_mean',
       'concave points_mean', 'symmetry_mean', 'fractal_dimension_mean',
       'radius_se', 'texture_se', 'perimeter_se', 'area_se', 'smoothness_se',
       'compactness_se', 'concavity_se', 'concave points_se', 'symmetry_se',
       'fractal_dimension_se', 'radius_worst', 'texture_worst',
       'perimeter_worst', 'area_worst', 'smoothness_worst',
       'compactness_worst', 'concavity_worst', 'concave points_worst',
       'symmetry_worst', 'fractal_dimension_worst', 'diagnosis'],
      dtype='object')

In [5]:
x = df.loc[:, "radius_mean" : "fractal_dimension_worst"]
y = df["diagnosis"]

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=20, random_state=10)

In [6]:
def test_scores(model_name, predictions):
    
    accuracy = accuracy_score(y_test, predictions)
    confusion = confusion_matrix(y_test, predictions)
    precision = precision_score(y_test, predictions)
    recall = recall_score(y_test, predictions)
    f1 = f1_score(y_test, predictions)
    
    confusion_str = (f"True Negative: {confusion[0][0]}, "
                     f"True Positive: {confusion[1][1]}, "
                     f"False Positive: {confusion[0][1]}, "
                    f"False Negative: {confusion[1][0]}")
    
    return {
        "Model": model_name,
        "Accuracy": accuracy,
        "Confusion Matrix": confusion_str,
        "Precision": precision,
        "Recall": recall,
        "F1": f1
    }

In [7]:
svc = SVC()
cvSVC = GridSearchCV(svc, {
    "C": [0.01, 0.1, 1, 10],
    "kernel": ["linear", "rbf"]
}, cv=10
, scoring=['accuracy', 'precision', 'recall', 'f1'],
refit="accuracy")

cvSVC.fit(x_train, y_train)


In [8]:
pd.options.display.max_columns = None

results_df = pd.DataFrame(cvSVC.cv_results_)
results_df[["param_C","param_kernel", "mean_test_accuracy","mean_test_precision","mean_test_recall", "mean_test_f1", "std_test_accuracy", "std_test_precision", "std_test_recall", "std_test_f1", "rank_test_accuracy", "rank_test_precision", "rank_test_recall", "rank_test_f1"]].sort_values("rank_test_accuracy")

Unnamed: 0,param_C,param_kernel,mean_test_accuracy,mean_test_precision,mean_test_recall,mean_test_f1,std_test_accuracy,std_test_precision,std_test_recall,std_test_f1,rank_test_accuracy,rank_test_precision,rank_test_recall,rank_test_f1
0,0.01,linear,0.952593,0.956714,0.916905,0.935075,0.020419,0.038712,0.049732,0.028279,1,6,1,1
2,0.1,linear,0.952593,0.95858,0.916667,0.934958,0.01873,0.046861,0.055226,0.02658,1,5,2,2
4,1.0,linear,0.952593,0.958818,0.916429,0.934668,0.01873,0.046823,0.059832,0.027211,1,4,4,3
6,10.0,linear,0.950741,0.95447,0.916667,0.932566,0.032795,0.056023,0.071063,0.045021,4,7,3,4
7,10.0,rbf,0.925118,0.943642,0.847619,0.891932,0.045678,0.055343,0.088372,0.070025,5,8,5,5
5,1.0,rbf,0.914209,0.960027,0.804048,0.873268,0.039035,0.058999,0.080327,0.060886,6,3,6,6
3,0.1,rbf,0.892391,0.980714,0.726429,0.829921,0.044703,0.043805,0.110961,0.079109,7,2,7,7
1,0.01,rbf,0.817845,1.0,0.512619,0.674693,0.028185,0.0,0.07203,0.0651,8,1,8,8


In [9]:
svc = cvSVC.best_estimator_
svc_pred = svc.predict(x_test)

svc_scores = test_scores("SVC", svc_pred)
svc_scores

{'Model': 'SVC',
 'Accuracy': 1.0,
 'Confusion Matrix': 'True Negative: 13, True Positive: 7, False Positive: 0, False Negative: 0',
 'Precision': 1.0,
 'Recall': 1.0,
 'F1': 1.0}

In [10]:
lg = LogisticRegression(max_iter=1000)

cvLG = GridSearchCV(lg, {
    "C": [0.01, 0.1, 1, 10, 100]
}, cv=10
, scoring=['accuracy', 'precision', 'recall', 'f1'],
refit="accuracy")

cvLG.fit(x_train, y_train)

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [11]:
results_df = pd.DataFrame(cvLG.cv_results_)
results_df[["param_C", "mean_test_accuracy","mean_test_precision","mean_test_recall", "mean_test_f1", "std_test_accuracy", "std_test_precision", "std_test_recall", "std_test_f1", "rank_test_accuracy", "rank_test_precision", "rank_test_recall", "rank_test_f1"]].sort_values("rank_test_accuracy")

Unnamed: 0,param_C,mean_test_accuracy,mean_test_precision,mean_test_recall,mean_test_f1,std_test_accuracy,std_test_precision,std_test_recall,std_test_f1,rank_test_accuracy,rank_test_precision,rank_test_recall,rank_test_f1
4,100.0,0.958081,0.958203,0.931429,0.942964,0.016445,0.037893,0.051108,0.022977,1,1,1,1
3,10.0,0.952559,0.954056,0.921667,0.934986,0.026255,0.049576,0.067884,0.037971,2,3,2,2
2,1.0,0.952559,0.954684,0.921429,0.934961,0.026255,0.052824,0.06778,0.037484,2,2,4,3
1,0.1,0.950707,0.94733,0.921667,0.932511,0.02625,0.042167,0.064095,0.03896,4,4,2,4
0,0.01,0.939764,0.931698,0.906667,0.917914,0.031002,0.048982,0.057299,0.044273,5,5,5,5


In [12]:
lg = cvLG.best_estimator_
lg_pred = lg.predict(x_test)

lg_scores = test_scores("Logistic Regression", lg_pred)
lg_scores

{'Model': 'Logistic Regression',
 'Accuracy': 0.95,
 'Confusion Matrix': 'True Negative: 12, True Positive: 7, False Positive: 1, False Negative: 0',
 'Precision': 0.875,
 'Recall': 1.0,
 'F1': 0.9333333333333333}

In [13]:
rf = RandomForestClassifier(criterion="entropy", random_state=10)

cvRF = GridSearchCV(rf, {
    "n_estimators": [100, 200, 300, 400]
}, cv=10
, scoring=['accuracy', 'precision', 'recall', 'f1'],
refit="accuracy")

cvRF.fit(x_train, y_train)

In [14]:
results_df = pd.DataFrame(cvRF.cv_results_)
results_df[["param_n_estimators", "mean_test_accuracy","mean_test_precision","mean_test_recall", "mean_test_f1", "std_test_accuracy", "std_test_precision", "std_test_recall", "std_test_f1", "rank_test_accuracy", "rank_test_precision", "rank_test_recall", "rank_test_f1"]].sort_values("rank_test_accuracy")

Unnamed: 0,param_n_estimators,mean_test_accuracy,mean_test_precision,mean_test_recall,mean_test_f1,std_test_accuracy,std_test_precision,std_test_recall,std_test_f1,rank_test_accuracy,rank_test_precision,rank_test_recall,rank_test_f1
0,100,0.963468,0.960542,0.940952,0.949804,0.026073,0.037506,0.053609,0.037011,1,2,1,1
1,200,0.963434,0.959957,0.940714,0.949274,0.031961,0.038558,0.066098,0.045894,2,3,2,2
2,300,0.963434,0.959957,0.940714,0.949274,0.031961,0.038558,0.066098,0.045894,2,3,2,2
3,400,0.963434,0.965512,0.935952,0.949253,0.02869,0.039902,0.063312,0.041388,2,1,4,4


In [15]:
cvRF = GridSearchCV(rf, {
    "n_estimators": [25, 50, 75, 100]
}, cv=10
, scoring=['accuracy', 'precision', 'recall', 'f1'],
refit="accuracy")

cvRF.fit(x_train, y_train)

In [16]:
results_df = pd.DataFrame(cvRF.cv_results_)
results_df[["param_n_estimators", "mean_test_accuracy","mean_test_precision","mean_test_recall", "mean_test_f1", "std_test_accuracy", "std_test_precision", "std_test_recall", "std_test_f1", "rank_test_accuracy", "rank_test_precision", "rank_test_recall", "rank_test_f1"]].sort_values("rank_test_accuracy")

Unnamed: 0,param_n_estimators,mean_test_accuracy,mean_test_precision,mean_test_recall,mean_test_f1,std_test_accuracy,std_test_precision,std_test_recall,std_test_f1,rank_test_accuracy,rank_test_precision,rank_test_recall,rank_test_f1
3,100,0.963468,0.960542,0.940952,0.949804,0.026073,0.037506,0.053609,0.037011,1,1,1,1
2,75,0.963468,0.960542,0.940952,0.949804,0.026073,0.037506,0.053609,0.037011,1,1,1,1
1,50,0.96165,0.960215,0.935952,0.946817,0.029003,0.037659,0.063312,0.042179,3,4,4,4
0,25,0.96165,0.960542,0.93619,0.947243,0.025354,0.037506,0.05449,0.036009,3,1,3,3


In [17]:
del rf

rf = RandomForestClassifier(n_estimators=100, criterion="entropy", random_state=10)
rf.fit(x_train, y_train)

rf_pred = rf.predict(x_test)
rf_scores = test_scores("Random Forest", rf_pred)

rf_scores

{'Model': 'Random Forest',
 'Accuracy': 0.95,
 'Confusion Matrix': 'True Negative: 12, True Positive: 7, False Positive: 1, False Negative: 0',
 'Precision': 0.875,
 'Recall': 1.0,
 'F1': 0.9333333333333333}

In [18]:
pd.options.display.max_colwidth = None
combined_scores = [svc_scores, lg_scores, rf_scores]
comparative_df = pd.DataFrame(combined_scores)

comparative_df

Unnamed: 0,Model,Accuracy,Confusion Matrix,Precision,Recall,F1
0,SVC,1.0,"True Negative: 13, True Positive: 7, False Positive: 0, False Negative: 0",1.0,1.0,1.0
1,Logistic Regression,0.95,"True Negative: 12, True Positive: 7, False Positive: 1, False Negative: 0",0.875,1.0,0.933333
2,Random Forest,0.95,"True Negative: 12, True Positive: 7, False Positive: 1, False Negative: 0",0.875,1.0,0.933333
