In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import plotly.express as px
from matplotlib import pyplot as plt

In [None]:
# Column names for the raw file, in file order; they are passed to read_csv
# because the names are supplied here rather than read from a header row.
column_names = [
    "Sample_code_number",
    "Clump_Thickness",
    "Uniformity_of_Cell_Size",
    "Uniformity_of_Cell_Shape",
    "Marginal_Adhesion",
    "Single_Epithelial_Cell_Size",
    "Bare_Nuclei",
    "Bland_Chromatin",
    "Normal_Nucleoli",
    "Mitoses",
    "Class",
]
dt = pd.read_csv(
    "/kaggle/input/breast-cancer-wisconsin-state/breast-cancer-wisconsin.data",
    header=None,
    names=column_names,
)
dt.head()

# Data Preprocessing

In [None]:
# Inspect column dtypes and non-null counts.
dt.info()

OK, there are no null values, but why is the **Bare_Nuclei** dtype `object`? Let's fix it.

In [None]:
# Count each distinct Bare_Nuclei value to see which entries are not numeric.
dt['Bare_Nuclei'].value_counts()

In [None]:
# Convert Bare_Nuclei to integers.
# errors='coerce' turns every non-numeric entry (the '?' placeholders) into NaN,
# so no separate replace step is needed.
dt['Bare_Nuclei'] = pd.to_numeric(dt['Bare_Nuclei'], errors='coerce')

# Impute only the column being cleaned, using its own median, instead of running
# a median fill over the whole frame (which would also touch the ID and target).
bare_nuclei_median = dt['Bare_Nuclei'].median()
dt['Bare_Nuclei'] = dt['Bare_Nuclei'].fillna(bare_nuclei_median).astype('int')

In [None]:
# Number of rows that are exact duplicates of an earlier row.
dt.duplicated().sum()

In [None]:
# Remove the duplicated rows in place, keeping the first occurrence of each.
dt.drop_duplicates(inplace=True)

In [None]:
# Recode the target labels 2 -> 0 and 4 -> 1 so the class is a 0/1 indicator.
# NOTE(review): presumably 2 = benign and 4 = malignant — confirm against the dataset docs.
dt["Class"] = dt["Class"].replace({2:0,4:1})

In [None]:
# Drop the sample identifier column in place.
# errors='ignore' makes the cell safe to re-run once the column is already gone.
dt.drop(columns='Sample_code_number', inplace=True, errors='ignore')

# EDA

In [None]:
# Class balance: absolute counts (left) and percentage share (right).
# Sorting by class label keeps the bar positions, the text annotations, the pie
# wedges and the colours aligned regardless of which class is more frequent.
Class_rate = dt["Class"].value_counts().sort_index()
class_colors = {0: 'red', 1: 'blue'}

# Left panel: counts per class, annotated with the exact value above each bar.
plt.figure(figsize = (17,6))
plt.subplot(1,2,1)
sns.barplot(x = Class_rate.index, y = Class_rate.values, palette=class_colors)
plt.title("Class Counts", fontweight="black", size=14, pad=15)
for i, v in enumerate(Class_rate.values):
    plt.text(i, v, v,ha="center", fontsize=14)

# Right panel: donut chart of the class shares, labelled from the data itself.
plt.subplot(1,2,2)
plt.pie(Class_rate, labels=Class_rate.index, autopct="%.2f%%", textprops={"size":14},
        colors=[class_colors[label] for label in Class_rate.index],
        explode=[0,0.1],startangle=90)
center_circle = plt.Circle((0, 0), 0.3, fc='white')
fig = plt.gcf()
fig.gca().add_artist(center_circle)
plt.title("Class Rate",fontweight="black",size=14 ,pad=15)
plt.show()

Our data is **not extremely imbalanced**, but we will still handle the imbalance in our models.

In [None]:
# Pairwise correlation of every column in dt, drawn as an annotated heatmap.
corr_matrix = dt.corr()
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    corr_matrix,
    annot=True,
    cmap="coolwarm",
    fmt=".2f",
    linewidths=0.5,
    ax=ax,
)
ax.set_title("Feature Correlation Heatmap")
plt.show()

In [None]:
# Correlation of each feature with the target, strongest positive first.
class_corr = dt.corr()['Class']
corr_values = class_corr.drop(['Class'])
print(corr_values.sort_values(ascending=False))

**Outlier handling**

In [None]:
# Horizontal box plots of the columns in dt, to look for outliers.
sns.boxplot(data=dt, orient='h')

In [None]:
# Histogram of every feature on a 3x3 grid, one panel per feature.
columns = ["Clump_Thickness", "Uniformity_of_Cell_Size", 
    "Uniformity_of_Cell_Shape", "Marginal_Adhesion", "Single_Epithelial_Cell_Size",
    "Bare_Nuclei", "Bland_Chromatin", "Normal_Nucleoli", "Mitoses"]
fig, axes = plt.subplots(nrows=3, ncols=3,figsize=(13,8))
fig.suptitle('Features vs Class\n', size = 18)
# axes.flat walks the grid row by row, so no manual row/column counters are
# needed. This also avoids overwriting the notebook-level names `x` and `y`,
# which the modeling cells use for the feature matrix and the target.
for ax, feature in zip(axes.flat, columns):
    ax.hist(dt[feature], bins=60, linewidth=0.5, edgecolor="white")
    ax.set_title(feature + " distribution")
plt.tight_layout()

In [None]:
# Summary statistics for the columns of dt.
dt.describe()

As we can see, the outliers here **are not data errors**. Because this is medical data, these outliers may well be valid cases, and most of the features with outliers are strongly related to the class, so we will not remove them. But what about the **Mitoses** feature? It does not really affect the classes, so let's explore its outliers and decide what to do.

In [None]:
# Distribution of Mitoses within each class. seaborn is already imported as
# `sns` in the notebook's first cell, so it is not re-imported here.
sns.boxplot(x='Class', y='Mitoses', data=dt)

As we can see, the outliers are not noise; they are biological signals, so we can handle them by scaling instead of removing them.

# Modeling

**Data split**

In [None]:
# Target vector and feature matrix (every column except the target).
y = dt['Class']
x = dt.drop(columns='Class')

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing. stratify=y keeps the class proportions
# the same in both splits, which matters because the classes are imbalanced.
x_Train, x_Test, y_Train, y_Test = train_test_split(
    x, y, test_size=0.3, random_state=42, stratify=y)

**modeling & scaling**

In [None]:
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_curve, roc_auc_score
from sklearn.metrics import confusion_matrix ,ConfusionMatrixDisplay,classification_report
from sklearn.preprocessing import RobustScaler

In [None]:
# (name, model) pairs used both as VotingClassifier members and as stand-alone
# baselines. Every stochastic model gets random_state=42 so that a fresh
# Restart & Run All reproduces the same scores.
estimator = [
    ('LogisticRegression',
     LogisticRegression(C=0.01, class_weight={0: 1, 1: 3}, random_state=42)),
    ('RandomForest',
     RandomForestClassifier(max_depth=6, min_samples_leaf=5, max_features='sqrt',
                            class_weight='balanced', random_state=42)),
    ('SVC',
     SVC(kernel='rbf', C=0.01, gamma='scale', class_weight={0: 1, 1: 3},
         random_state=42, probability=True)),
    ('AdaBoostClassifier',
     AdaBoostClassifier(random_state=42)),
    # subsample=0.8 draws a random subset of rows per boosting stage, so this
    # model is stochastic unless it is seeded.
    ('GradientBoostingClassifier',
     GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=3,
                                subsample=0.8, random_state=42)),
    ('DecisionTreeClassifier',
     DecisionTreeClassifier(class_weight='balanced', random_state=42)),
]

In [None]:
pip install "imbalanced-learn<0.10.0" "scikit-learn<1.3.0"

**Imbalanced data handling**

In [None]:
from imblearn.over_sampling import RandomOverSampler

# Randomly oversample the training split only; the test split is left untouched.
ros = RandomOverSampler(random_state=42)
x_osampled, y_osampled = ros.fit_resample(x_Train, y_Train)

# Compare like with like: the training labels before and after resampling
# (the "before" line previously printed the label counts of the whole dataset).
print("Before oversampling" , y_Train.value_counts())
print("After oversampling" , y_osampled.value_counts())

# Hard voting

In [None]:
from sklearn.pipeline import make_pipeline

# Hard (majority-vote) ensemble of all base estimators behind a RobustScaler.
hard_voter = VotingClassifier(estimators=estimator, voting='hard')
pipeline = make_pipeline(RobustScaler(), hard_voter)
pipeline.fit(x_osampled, y_osampled)

# Predictions on the held-out test split and on the data the model was fitted on.
y_pred_V = pipeline.predict(x_Test)
train_pred_V = pipeline.predict(x_osampled)

# First row of the comparison table; all metrics use the predicted labels.
result = [
    {
        "Model-Name": "VotingClassifier",
        "Test_Accuracy": accuracy_score(y_Test, y_pred_V) * 100,
        "Train_Accuracy": accuracy_score(y_osampled, train_pred_V) * 100,
        "ROC_AUC": roc_auc_score(y_Test, y_pred_V),
        "F1_Score": f1_score(y_Test, y_pred_V),
        "Recall": recall_score(y_Test, y_pred_V),
        "Precision": precision_score(y_Test, y_pred_V),
    }
]

In [None]:
def evaluation(x_train, y_train, x_test, y_test, models, prior_results=None):
    """Fit each model behind a RobustScaler and tabulate its scores.

    Parameters
    ----------
    x_train, y_train : training features and labels used to fit every pipeline.
    x_test, y_test : held-out features and labels used for the test metrics.
    models : iterable of (name, estimator) pairs. Each estimator is fitted in
        place as the final step of its own pipeline.
    prior_results : optional list of already-computed metric dicts to include
        in the table. Defaults to the notebook-level ``result`` list, so the
        voting-classifier row still appears alongside the base models.

    Returns
    -------
    pandas.DataFrame indexed by "Model-Name", sorted by "F1_Score" descending.

    Notes
    -----
    Rows are collected in a local list, so the notebook-level ``result`` is not
    modified and re-running the cell does not duplicate rows. ROC_AUC is
    computed from the predicted labels, not from scores or probabilities.
    """
    if prior_results is None:
        prior_results = result
    rows = list(prior_results)

    for name, model in models:
        pipeline = make_pipeline(RobustScaler(), model)
        pipeline.fit(x_train, y_train)
        y_pred = pipeline.predict(x_test)
        train_pred = pipeline.predict(x_train)
        rows.append({
            "Model-Name": name,
            "Test_Accuracy": accuracy_score(y_test, y_pred) * 100,
            "Train_Accuracy": accuracy_score(y_train, train_pred) * 100,
            "ROC_AUC": roc_auc_score(y_test, y_pred),
            "F1_Score": f1_score(y_test, y_pred),
            "Recall": recall_score(y_test, y_pred),
            "Precision": precision_score(y_test, y_pred)
        })

    models_res = pd.DataFrame(rows).set_index('Model-Name')
    return models_res.sort_values("F1_Score", ascending=False)

In [None]:
# Score every base estimator: fitted on the oversampled training data, tested
# on the held-out split. The returned table is sorted by F1 score.
model_evaluation=evaluation(x_osampled, y_osampled,x_Test,y_Test,estimator)
model_evaluation

As we can see, the **VotingClassifier** performs better than the best individual estimator.

In [None]:
# Confusion matrix of the hard-voting predictions on the held-out test split.
# It is computed once and reused for the plot.
cm = confusion_matrix(y_Test, y_pred_V)

fig, ax = plt.subplots(figsize=(6, 5))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
# confusion_matrix(y_true, y_pred) puts the true classes on the rows and the
# predicted classes on the columns.
ax.set_xlabel("Predicted class")
ax.set_ylabel("True class")
ax.set_title("Hard voting - Confusion Matrix")
plt.show()

The most important thing is that the model does not produce many **false negatives**, and that is what we observe here.

In [None]:
# Per-class precision, recall and F1 for the hard-voting predictions on the test split.
print(classification_report(y_Test, y_pred_V))

In [None]:
from sklearn.model_selection import learning_curve
from imblearn.pipeline import make_pipeline as make_imb_pipeline

# Learning curve for the current `pipeline`.
# Cross-validating on already-oversampled data leaks information: copies of the
# same minority row can land in both the training and the validation fold.
# Instead, start from the original training split and let an imblearn pipeline
# oversample inside each fold, on that fold's training part only.
cv_pipeline = make_imb_pipeline(RandomOverSampler(random_state=42), pipeline)
train_sizes, train_scores, test_scores = learning_curve(
    cv_pipeline, x_Train, y_Train, cv=5, scoring='accuracy',
    train_sizes=np.linspace(0.1, 1.0, 10))
train_mean = np.mean(train_scores, axis=1)
test_mean = np.mean(test_scores, axis=1)

plt.plot(train_sizes, train_mean, label='Training Accuracy')
plt.plot(train_sizes, test_mean, label='Validation Accuracy')
plt.xlabel('Training Size')
plt.ylabel('Accuracy')
plt.legend()
plt.show()

Validation accuracy improves with data and it generalizes well

In [None]:
from sklearn.model_selection import cross_val_score
from imblearn.pipeline import make_pipeline as make_imb_pipeline

# 5-fold CV accuracy for the current `pipeline`.
# Oversampling happens inside each fold (training part only) so that duplicated
# minority rows cannot appear on both sides of a split and inflate the score.
cv_pipeline = make_imb_pipeline(RandomOverSampler(random_state=42), pipeline)
scores = cross_val_score(cv_pipeline, x_Train, y_Train, cv=5)
print(f"Mean CV Accuracy: {scores.mean():.4f} (±{scores.std():.4f})")

# soft voting

In [None]:
# Soft-voting ensemble of all base estimators behind a RobustScaler.
soft_voter = VotingClassifier(estimators=estimator, voting='soft')
pipeline = make_pipeline(RobustScaler(), soft_voter)
pipeline.fit(x_osampled, y_osampled)

# Predictions on the held-out test split and on the data the model was fitted on.
y_pred_s = pipeline.predict(x_Test)
train_pred_s = pipeline.predict(x_osampled)

# Single-row metrics record; all metrics use the predicted labels.
S_res = [
    {
        "Model-Name": "VotingClassifier_Soft",
        "Test_Accuracy": accuracy_score(y_Test, y_pred_s) * 100,
        "Train_Accuracy": accuracy_score(y_osampled, train_pred_s) * 100,
        "ROC_AUC": roc_auc_score(y_Test, y_pred_s),
        "F1_Score": f1_score(y_Test, y_pred_s),
        "Recall": recall_score(y_Test, y_pred_s),
        "Precision": precision_score(y_Test, y_pred_s),
    }
]

In [None]:
# Soft-voting metrics as a table indexed by model name.
S_model_res = pd.DataFrame(S_res).set_index('Model-Name')
S_model_res

It's close to the Hard one but it generalizes better, so we have less overfitting

In [None]:
from sklearn.ensemble import BaggingClassifier

# Bagged ensemble of 500 RBF-kernel SVCs, each fitted on a bootstrap sample of
# 100 rows, behind a RobustScaler.
bagged_svc = BaggingClassifier(
    SVC(kernel='rbf', C=0.01, gamma='scale', class_weight={0: 1, 1: 3},
        random_state=42, probability=True),
    n_estimators=500,
    max_samples=100,
    bootstrap=True,
    n_jobs=-1,
    random_state=42,  # seed the bootstrap draws so the scores are reproducible
)
pipeline = make_pipeline(RobustScaler(), bagged_svc)
pipeline.fit(x_osampled, y_osampled)

# Predictions on the held-out test split and on the data the model was fitted on.
y_pred_B = pipeline.predict(x_Test)
train_pred_B = pipeline.predict(x_osampled)

# Single-row metrics record. The label names the model that was actually
# fitted: a BaggingClassifier over SVCs, not a voting ensemble.
bagging_rows = [
    {
        "Model-Name": "BaggingClassifier_SVC",
        "Test_Accuracy": accuracy_score(y_Test, y_pred_B) * 100,
        "Train_Accuracy": accuracy_score(y_osampled, train_pred_B) * 100,
        "ROC_AUC": roc_auc_score(y_Test, y_pred_B),
        "F1_Score": f1_score(y_Test, y_pred_B),
        "Recall": recall_score(y_Test, y_pred_B),
        "Precision": precision_score(y_Test, y_pred_B),
    }
]
# `res` is only ever bound to the finished table, never to the intermediate list.
res = pd.DataFrame(bagging_rows)

# Bagging

In [None]:
# Bagging metrics as a table indexed by model name.
B_model_res = pd.DataFrame(res).set_index('Model-Name')
B_model_res

In [None]:
# Confusion matrix of the bagging predictions on the held-out test split,
# drawn in the left slot of a 1x2 grid.
fig = plt.figure(figsize=(12, 5))
ax = fig.add_subplot(1, 2, 1)
bagging_cm = confusion_matrix(y_Test, y_pred_B)
sns.heatmap(bagging_cm, annot=True, fmt='d', cmap='Blues', ax=ax)
ax.set_title("Bagging - Confusion Matrix")
plt.show()

In [None]:
from sklearn.model_selection import learning_curve
train_sizes, train_scores, test_scores = learning_curve(
    pipeline, x_osampled, y_osampled, cv=5, scoring='accuracy', train_sizes=np.linspace(0.1, 1.0, 10))
train_mean_B = np.mean(train_scores, axis=1)
test_mean_B = np.mean(test_scores, axis=1)

plt.plot(train_sizes, train_mean_B, label='Training Accuracy')
plt.plot(train_sizes, test_mean_B, label='Validation Accuracy')
plt.xlabel('Training Size')
plt.ylabel('Accuracy')
plt.legend()
plt.show()

No overfitting, strong generalization