In [151]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import load_breast_cancer

In [152]:
# Display names of the models, listed in the order they are trained below.
models = [
    "Logistic Regression",
    "Decision Tree",
    "Random Forest",
    "SVM",
    "k-NN",
]
# Test-set accuracy of each model; every model section appends one score,
# so entries line up with `models` by position.
accuracy = list()

In [153]:
# Load the breast-cancer dataset bundled with scikit-learn; the returned object
# exposes the feature matrix (data.data), the feature names (data.feature_names)
# and the class labels (data.target), which the next cell wraps in pandas objects.
data = load_breast_cancer()

In [154]:
# Wrap the raw arrays in pandas objects so every feature keeps its column name
# and the label vector carries an explicit name.
X = pd.DataFrame(data=data["data"], columns=data["feature_names"])
y = pd.Series(data=data["target"], name="target")

In [155]:
# Sanity check of the feature matrix dimensions: (number of samples, number of features).
X.shape

(569, 30)

In [156]:
# Display the feature matrix.
# NOTE(review): this renders the whole frame (pandas truncates the middle rows/columns);
# X.head() would give a shorter preview.
X

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
0,17.99,10.38,122.80,1001.0,0.11840,0.27760,0.30010,0.14710,0.2419,0.07871,...,25.380,17.33,184.60,2019.0,0.16220,0.66560,0.7119,0.2654,0.4601,0.11890
1,20.57,17.77,132.90,1326.0,0.08474,0.07864,0.08690,0.07017,0.1812,0.05667,...,24.990,23.41,158.80,1956.0,0.12380,0.18660,0.2416,0.1860,0.2750,0.08902
2,19.69,21.25,130.00,1203.0,0.10960,0.15990,0.19740,0.12790,0.2069,0.05999,...,23.570,25.53,152.50,1709.0,0.14440,0.42450,0.4504,0.2430,0.3613,0.08758
3,11.42,20.38,77.58,386.1,0.14250,0.28390,0.24140,0.10520,0.2597,0.09744,...,14.910,26.50,98.87,567.7,0.20980,0.86630,0.6869,0.2575,0.6638,0.17300
4,20.29,14.34,135.10,1297.0,0.10030,0.13280,0.19800,0.10430,0.1809,0.05883,...,22.540,16.67,152.20,1575.0,0.13740,0.20500,0.4000,0.1625,0.2364,0.07678
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,21.56,22.39,142.00,1479.0,0.11100,0.11590,0.24390,0.13890,0.1726,0.05623,...,25.450,26.40,166.10,2027.0,0.14100,0.21130,0.4107,0.2216,0.2060,0.07115
565,20.13,28.25,131.20,1261.0,0.09780,0.10340,0.14400,0.09791,0.1752,0.05533,...,23.690,38.25,155.00,1731.0,0.11660,0.19220,0.3215,0.1628,0.2572,0.06637
566,16.60,28.08,108.30,858.1,0.08455,0.10230,0.09251,0.05302,0.1590,0.05648,...,18.980,34.12,126.70,1124.0,0.11390,0.30940,0.3403,0.1418,0.2218,0.07820
567,20.60,29.33,140.10,1265.0,0.11780,0.27700,0.35140,0.15200,0.2397,0.07016,...,25.740,39.42,184.60,1821.0,0.16500,0.86810,0.9387,0.2650,0.4087,0.12400


In [157]:
# Display the target vector (one integer class label per sample).
y

Unnamed: 0,target
0,0
1,0
2,0
3,0
4,0
...,...
564,0
565,0
566,0
567,0


In [158]:
# Count missing values per feature column; all zeros means no imputation is needed.
X.isna().sum()


Unnamed: 0,0
mean radius,0
mean texture,0
mean perimeter,0
mean area,0
mean smoothness,0
mean compactness,0
mean concavity,0
mean concave points,0
mean symmetry,0
mean fractal dimension,0


In [159]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [160]:
# Hold out 20% of the samples for testing. Stratifying on y keeps the class
# ratio the same in both splits, and the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [161]:
# Learn the scaling statistics from the training split only, then apply the
# same transformation to both splits so no test-set information leaks into training.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

The dataset was loaded using `sklearn.datasets.load_breast_cancer()`.
Missing values were checked, but the dataset contains none, so no imputation was required. Feature scaling was performed using `StandardScaler`, which standardizes each feature to have zero mean and unit variance.

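Feature scaling is necessary because algorithms such as Logistic Regression, SVM, and k-NN are sensitive to the magnitude of features. Without scaling, features with larger ranges would dominate distance calculations and optimization, leading to biased or poor model performance. Decision Tree and Random Forest split on one feature at a time and are not affected by feature scale, so they are trained on the unscaled features.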


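The dataset was split into training (80%) and test (20%) sets, stratified by class, before scaling, so that the scaler is fitted on the training data only. The held-out test set is used to evaluate how well the models generalize to unseen data and to detect overfitting.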



**LOGISTIC REGRESSION**

In [162]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

In [163]:
# Fit on the standardized training features (max_iter=1000 sets the solver's
# iteration cap), then predict labels for the standardized test features.
lr = LogisticRegression(max_iter=1000).fit(X_train_scaled, y_train)
lr_pred = lr.predict(X_test_scaled)

In [164]:
# Fraction of test samples classified correctly, recorded for the final comparison.
# NOTE(review): append is not idempotent; re-running this cell adds a duplicate
# score and shifts the pairing between `accuracy` and `models`.
lr_acc = accuracy_score(y_true=y_test, y_pred=lr_pred)
accuracy.append(lr_acc)
lr_acc

0.9824561403508771

In [165]:
# Per-class precision, recall and F1 on the held-out test set.
print(
    "Classification Report - Logistic Regression\n",
    classification_report(y_test, lr_pred),
    sep="\n",
)

Classification Report - Logistic Regression

              precision    recall  f1-score   support

           0       0.98      0.98      0.98        42
           1       0.99      0.99      0.99        72

    accuracy                           0.98       114
   macro avg       0.98      0.98      0.98       114
weighted avg       0.98      0.98      0.98       114



Logistic regression is a statistical method used for predicting categorical outcomes. It is a linear model that estimates the probability of a binary outcome using a sigmoid function. It works well for linearly separable data and gives good results on this dataset.

**DECISION TREE**

In [166]:
from sklearn.tree import DecisionTreeClassifier

In [167]:
# Trained on the unscaled features (X_train / X_test); the fixed random_state
# makes the fitted tree repeatable across runs.
dt = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)
dt_pred = dt.predict(X_test)

In [168]:
# Fraction of test samples classified correctly, recorded for the final comparison.
# NOTE(review): append is not idempotent; re-running this cell adds a duplicate
# score and shifts the pairing between `accuracy` and `models`.
dt_acc = accuracy_score(y_true=y_test, y_pred=dt_pred)
accuracy.append(dt_acc)
dt_acc

0.9122807017543859

In [169]:
# Per-class precision, recall and F1 on the held-out test set.
print(
    "Classification Report - Decision Tree\n",
    classification_report(y_test, dt_pred),
    sep="\n",
)

Classification Report - Decision Tree

              precision    recall  f1-score   support

           0       0.85      0.93      0.89        42
           1       0.96      0.90      0.93        72

    accuracy                           0.91       114
   macro avg       0.90      0.92      0.91       114
weighted avg       0.92      0.91      0.91       114



DecisionTreeClassifier is a supervised learning algorithm used for classification that splits the data into branches based on feature values, creating a tree-like structure of decisions. It can capture non-linear relationships and is easy to interpret.

**RANDOM FOREST**

In [170]:
from sklearn.ensemble import RandomForestClassifier

In [171]:
# Trained on the unscaled features (X_train / X_test); the fixed random_state
# makes the fitted forest repeatable across runs.
rf = RandomForestClassifier(random_state=42).fit(X_train, y_train)
rf_pred = rf.predict(X_test)

In [172]:
# Fraction of test samples classified correctly, recorded for the final comparison.
# NOTE(review): append is not idempotent; re-running this cell adds a duplicate
# score and shifts the pairing between `accuracy` and `models`.
rf_acc = accuracy_score(y_true=y_test, y_pred=rf_pred)
accuracy.append(rf_acc)
rf_acc

0.956140350877193

In [173]:
# Per-class precision, recall and F1 on the held-out test set.
print(
    "Classification Report - Random Forest\n",
    classification_report(y_test, rf_pred),
    sep="\n",
)

Classification Report - Random Forest

              precision    recall  f1-score   support

           0       0.95      0.93      0.94        42
           1       0.96      0.97      0.97        72

    accuracy                           0.96       114
   macro avg       0.96      0.95      0.95       114
weighted avg       0.96      0.96      0.96       114



Random Forest is an ensemble method that combines multiple decision trees trained on random subsets of the data and features. It improves accuracy and reduces overfitting compared with a single tree, making it highly suitable for this dataset.

It performs well on complex datasets and handles feature interactions effectively.

**SVM**

In [174]:
from sklearn.svm import SVC

In [175]:
# RBF-kernel support vector classifier, fitted on the standardized training
# features and evaluated on the standardized test features.
svm = SVC(kernel="rbf").fit(X_train_scaled, y_train)
svm_pred = svm.predict(X_test_scaled)

In [176]:
# Fraction of test samples classified correctly, recorded for the final comparison.
# NOTE(review): append is not idempotent; re-running this cell adds a duplicate
# score and shifts the pairing between `accuracy` and `models`.
svm_acc = accuracy_score(y_true=y_test, y_pred=svm_pred)
accuracy.append(svm_acc)
svm_acc

0.9824561403508771

In [177]:
# Per-class precision, recall and F1 on the held-out test set.
print(
    "Classification Report - SVM\n",
    classification_report(y_test, svm_pred),
    sep="\n",
)

Classification Report - SVM

              precision    recall  f1-score   support

           0       0.98      0.98      0.98        42
           1       0.99      0.99      0.99        72

    accuracy                           0.98       114
   macro avg       0.98      0.98      0.98       114
weighted avg       0.98      0.98      0.98       114



SVM (Support Vector Machine) is a margin-based, discriminative classifier that finds an optimal separating hyperplane such that the minimum distance between the hyperplane and the closest data points of each class is maximized.
In other words, SVM finds the hyperplane that maximizes the margin between classes; with the RBF kernel used here, that boundary can be non-linear in the original feature space.

It is highly effective for high-dimensional datasets like this one.

**K-NN**

In [178]:
from sklearn.neighbors import KNeighborsClassifier

In [179]:
# 5-nearest-neighbours classifier, fitted on the standardized training
# features and evaluated on the standardized test features.
knn = KNeighborsClassifier(n_neighbors=5).fit(X_train_scaled, y_train)
knn_pred = knn.predict(X_test_scaled)

In [180]:
# Fraction of test samples classified correctly, recorded for the final comparison.
# NOTE(review): append is not idempotent; re-running this cell adds a duplicate
# score and shifts the pairing between `accuracy` and `models`.
knn_acc = accuracy_score(y_true=y_test, y_pred=knn_pred)
accuracy.append(knn_acc)
knn_acc

0.956140350877193

In [181]:
# Per-class precision, recall and F1 on the held-out test set.
print(
    "Classification Report - k-NN\n",
    classification_report(y_test, knn_pred),
    sep="\n",
)

Classification Report - k-NN

              precision    recall  f1-score   support

           0       0.95      0.93      0.94        42
           1       0.96      0.97      0.97        72

    accuracy                           0.96       114
   macro avg       0.96      0.95      0.95       114
weighted avg       0.96      0.96      0.96       114



K-Nearest Neighbours is a supervised algorithm used for classification in which a data point is classified based on the majority class of its nearest neighbours.
Here k = 5, so each test point is assigned the majority class among its five nearest training points.

It works well when similar data points belong to the same class, which is common in medical datasets.

In [182]:
# Guard against hidden notebook state: every model section appends one score to
# `accuracy`, so re-running (or skipping) a section leaves the list out of step
# with `models`, and the loop below would label scores with the wrong model.
assert len(accuracy) == len(models), (
    f"expected {len(models)} accuracy scores (one per model), found {len(accuracy)}; "
    "restart the kernel and run all cells from the top"
)

# Print each model name next to its test-set accuracy.
for model, score in zip(models, accuracy):
    print(model, ":", score)

Logistic Regression : 0.9824561403508771
Decision Tree : 0.9122807017543859
Random Forest : 0.956140350877193
SVM : 0.9824561403508771
k-NN : 0.956140350877193


SVM is the best model, tied with Logistic Regression: both reach the highest accuracy (0.982) among all models and have identical precision and recall on the test set. This shows that SVM works well with high-dimensional data.

From the analysis, the worst model for this dataset is the Decision Tree. Its accuracy is 0.91, which is noticeably lower than that of the other models.