In [1]:
import warnings
import sys
if not sys.warnoptions:
    warnings.simplefilter("ignore")

In [2]:
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np

In [3]:
data = load_breast_cancer()
X = pd.DataFrame(data.data, columns=data.feature_names)
y = pd.Series(data.target, name="target")


LOADING THE DATASET

In [4]:
print("Dataset shape:", X.shape)

Dataset shape: (569, 30)


In [5]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 30 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   mean radius              569 non-null    float64
 1   mean texture             569 non-null    float64
 2   mean perimeter           569 non-null    float64
 3   mean area                569 non-null    float64
 4   mean smoothness          569 non-null    float64
 5   mean compactness         569 non-null    float64
 6   mean concavity           569 non-null    float64
 7   mean concave points      569 non-null    float64
 8   mean symmetry            569 non-null    float64
 9   mean fractal dimension   569 non-null    float64
 10  radius error             569 non-null    float64
 11  texture error            569 non-null    float64
 12  perimeter error          569 non-null    float64
 13  area error               569 non-null    float64
 14  smoothness error         5

In [6]:
X.describe()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
count,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,...,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0
mean,14.127292,19.289649,91.969033,654.889104,0.09636,0.104341,0.088799,0.048919,0.181162,0.062798,...,16.26919,25.677223,107.261213,880.583128,0.132369,0.254265,0.272188,0.114606,0.290076,0.083946
std,3.524049,4.301036,24.298981,351.914129,0.014064,0.052813,0.07972,0.038803,0.027414,0.00706,...,4.833242,6.146258,33.602542,569.356993,0.022832,0.157336,0.208624,0.065732,0.061867,0.018061
min,6.981,9.71,43.79,143.5,0.05263,0.01938,0.0,0.0,0.106,0.04996,...,7.93,12.02,50.41,185.2,0.07117,0.02729,0.0,0.0,0.1565,0.05504
25%,11.7,16.17,75.17,420.3,0.08637,0.06492,0.02956,0.02031,0.1619,0.0577,...,13.01,21.08,84.11,515.3,0.1166,0.1472,0.1145,0.06493,0.2504,0.07146
50%,13.37,18.84,86.24,551.1,0.09587,0.09263,0.06154,0.0335,0.1792,0.06154,...,14.97,25.41,97.66,686.5,0.1313,0.2119,0.2267,0.09993,0.2822,0.08004
75%,15.78,21.8,104.1,782.7,0.1053,0.1304,0.1307,0.074,0.1957,0.06612,...,18.79,29.72,125.4,1084.0,0.146,0.3391,0.3829,0.1614,0.3179,0.09208
max,28.11,39.28,188.5,2501.0,0.1634,0.3454,0.4268,0.2012,0.304,0.09744,...,36.04,49.54,251.2,4254.0,0.2226,1.058,1.252,0.291,0.6638,0.2075


In [7]:
print("Number of missing values in each column:\n", X.isnull().sum())


Number of missing values in each column:
 mean radius                0
mean texture               0
mean perimeter             0
mean area                  0
mean smoothness            0
mean compactness           0
mean concavity             0
mean concave points        0
mean symmetry              0
mean fractal dimension     0
radius error               0
texture error              0
perimeter error            0
area error                 0
smoothness error           0
compactness error          0
concavity error            0
concave points error       0
symmetry error             0
fractal dimension error    0
worst radius               0
worst texture              0
worst perimeter            0
worst area                 0
worst smoothness           0
worst compactness          0
worst concavity            0
worst concave points       0
worst symmetry             0
worst fractal dimension    0
dtype: int64


DETECTING MISSING VALUES.IN THIS DTASET THERE IS NO MISSING VALUES

In [9]:
# Preprocessing: Feature scaling
scaler = StandardScaler()  # Standardization scales the features to have mean=0 and variance=1
X_scaled = scaler.fit_transform(X)  # Fit and transform the features

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Verify scaled data
print("\nFirst 5 rows of scaled features:\n", pd.DataFrame(X_scaled, columns=data.feature_names).head())


First 5 rows of scaled features:
    mean radius  mean texture  mean perimeter  mean area  mean smoothness  \
0     1.097064     -2.073335        1.269934   0.984375         1.568466   
1     1.829821     -0.353632        1.685955   1.908708        -0.826962   
2     1.579888      0.456187        1.566503   1.558884         0.942210   
3    -0.768909      0.253732       -0.592687  -0.764464         3.283553   
4     1.750297     -1.151816        1.776573   1.826229         0.280372   

   mean compactness  mean concavity  mean concave points  mean symmetry  \
0          3.283515        2.652874             2.532475       2.217515   
1         -0.487072       -0.023846             0.548144       0.001392   
2          1.052926        1.363478             2.037231       0.939685   
3          3.402909        1.915897             1.451707       2.867383   
4          0.539340        1.371011             1.428493      -0.009560   

   mean fractal dimension  ...  worst radius  worst textu

### CLASSIFICATION ALGORITHM

In [10]:
# Importing necessary libraries
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, accuracy_score

# Dictionary to store classifiers and their names
classifiers = {
    "Logistic Regression": LogisticRegression(random_state=42),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42, n_estimators=100),
    "Support Vector Machine (SVM)": SVC(kernel='linear', random_state=42),
    "k-Nearest Neighbors (k-NN)": KNeighborsClassifier(n_neighbors=5)
}

# Loop through each classifier, train it, and evaluate it
for name, clf in classifiers.items():
    # Train the classifier
    clf.fit(X_train, y_train)
    # Make predictions
    y_pred = clf.predict(X_test)
    # Evaluate performance
    print(f"\n=== {name} ===")
    print("Accuracy: {:.2f}%".format(accuracy_score(y_test, y_pred) * 100))
    print("Classification Report:\n", classification_report(y_test, y_pred))


=== Logistic Regression ===
Accuracy: 97.37%
Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.95      0.96        43
           1       0.97      0.99      0.98        71

    accuracy                           0.97       114
   macro avg       0.97      0.97      0.97       114
weighted avg       0.97      0.97      0.97       114


=== Decision Tree ===
Accuracy: 94.74%
Classification Report:
               precision    recall  f1-score   support

           0       0.93      0.93      0.93        43
           1       0.96      0.96      0.96        71

    accuracy                           0.95       114
   macro avg       0.94      0.94      0.94       114
weighted avg       0.95      0.95      0.95       114


=== Random Forest ===
Accuracy: 96.49%
Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.93      0.95        43
           1       0.96      0.99      

#### Classification Algorithms:

1. Logistic Regression: Linear model for binary classification. Calculates probability using sigmoid function. Suitable for linearly separable data.

2. Decision Tree Classifier: Splits data into subsets based on feature values. Handles non-linear relationships and is interpretable.

3. Random Forest Classifier: Ensemble method combining multiple decision trees. Reduces overfitting and improves generalization.

4. Support Vector Machine (SVM): Creates hyperplane to separate classes by maximizing margin. Works well with high-dimensional data.

5. k-Nearest Neighbors (k-NN): Classifies instances based on majority class among k nearest neighbors. Captures complex patterns in non-linear data.

Classification Algorithm Comparison

Logistic Regression

- Accuracy: 97.37%
- Precision: High for both classes (0 = 98%, 1 = 97%)
- Recall: High for both classes (0 = 95%, 1 = 99%)
- F1-Score: 96% (Class 0), 98% (Class 1)
Logistic regression performed the best, with high accuracy and strong precision, recall, and F1-scores. It is particularly well-suited for this dataset because it works effectively for binary classification problems with linearly separable data.

Decision Tree

- Accuracy: 94.74%
- Precision: Good for both classes (0 = 93%, 1 = 96%)
- Recall: Good for both classes (0 = 93%, 1 = 96%)
- F1-Score: 93% (Class 0), 96% (Class 1)
Decision trees performed decently but had the lowest accuracy among the models. While they are good at capturing non-linear relationships, they are prone to overfitting, which may explain the slightly lower performance compared to other models.

Random Forest

- Accuracy: 96.49%
- Precision: High for both classes (0 = 98%, 1 = 96%)
- Recall: High for both classes (0 = 93%, 1 = 99%)
- F1-Score: 95% (Class 0), 97% (Class 1)
Random forests performed almost as well as logistic regression. They handled feature interactions and reduced overfitting, which helped them achieve high precision and recall.

Support Vector Machine (SVM)

- Accuracy: 95.61%
- Precision: Good for both classes (0 = 93%, 1 = 97%)
- Recall: Good for both classes (0 = 95%, 1 = 96%)
- F1-Score: 94% (Class 0), 96% (Class 1)
SVM also performed well, with an accuracy of 95.61%. Its ability to find a decision boundary with a large margin likely contributed to its good performance.

k-Nearest Neighbors (k-NN)

- Accuracy: 94.74%
- Precision: Good for both classes (0 = 93%, 1 = 96%)
- Recall: Good for both classes (0 = 93%, 1 = 96%)
- F1-Score: 93% (Class 0), 96% (Class 1)
k-NN performed similarly to the decision tree, with an accuracy of 94.74%. While it is simple and effective, it may be sensitive to the choice of k and feature scaling. The slightly lower accuracy and F1-scores indicate it might not capture complex patterns as effectively as the other algorithms.