In [1]:
#1.Loading and Preprocessing
from sklearn.datasets import load_breast_cancer
import pandas as pd
from sklearn.preprocessing import StandardScaler
#Loading the dataset
data=load_breast_cancer()
X=pd.DataFrame(data.data,columns=data.feature_names)
y=pd.Series(data.target)
#Checking for any missing values
print("Missing values:\n",X.isnull().sum().sum())
#Feature Scaling
scaler=StandardScaler()
X_scaled=scaler.fit_transform(X)
print("Data loaded and scaled successfully.")

Missing values:
 0
Data loaded and scaled successfully.


In [3]:
#Explanation for the above step 
#Missing values:Checked and found that no missing values exist in this dataset.
#Feature Scaling:Used StandardScaler method to normalize the data.It is required especially for SVM and K-NN, which are sensitive to scale.

In [5]:
#2.Classification Algorithm Implementation
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score
#Training test splitting
X_train,X_test,y_train,y_test=train_test_split(X_scaled,y,test_size=0.2,random_state=42)
#Defining the models
models={
    "Logistic Regression":LogisticRegression(max_iter=1000),
    "Decision Tree":DecisionTreeClassifier(random_state=42),
    "Random Forest":RandomForestClassifier(random_state=42),
    "SVM":SVC(),
    "K-NN":KNeighborsClassifier()
}
results=[]
#Training and evaluating each model
for name,model in models.items():
    model.fit(X_train,y_train)
    y_pred=model.predict(X_test)
    acc=accuracy_score(y_test,y_pred)
    prec=precision_score(y_test,y_pred)
    rec=recall_score(y_test,y_pred)
    f1=f1_score(y_test,y_pred)
    results.append({
        "Model":name,
        "Accuracy":acc,
        "Precision":prec,
        "Recall":rec,
        "F1 Score":f1
    })
results_df=pd.DataFrame(results)
print(results_df.sort_values(by="Accuracy",ascending=False))

                 Model  Accuracy  Precision    Recall  F1 Score
0  Logistic Regression  0.973684   0.972222  0.985915  0.979021
3                  SVM  0.973684   0.972222  0.985915  0.979021
2        Random Forest  0.964912   0.958904  0.985915  0.972222
1        Decision Tree  0.947368   0.957746  0.957746  0.957746
4                 K-NN  0.947368   0.957746  0.957746  0.957746


In [7]:
#Model Description
#Logistic Regression:A linear model for binary classification which is fast and interpretable.
#Decision Tree:Splits the data into several branches based on feature thresholds and is easy to interpret but may overfit.
#Random Forest:Ensemble of decision trees and reduces overfitting and performs well on many tasks.
#Support Vector Machine(SVM):Finds a hyperplane that can best separates classes ans is effective in high-dimensional space.
#KNN(k-Nearest Neighbors):Classifies based on majority vote from k nearest neighbours. It is simple,but slower on large datasets.

In [9]:
#3.Model Comparison
#Best Performing algorithm:Random Forest or SVM.
#Worst Performing algorithm: Often Decision Tree or KNN,which is depending on overfitting or scaling sensitivity.