In [1]:
# Import required libraries
# numpy: array helpers (np.bincount is used for the class counts in later cells)
import numpy as np
# pandas: imported here but not referenced in the cells visible in this part of the notebook
import pandas as pd
# Loader for the Breast Cancer dataset
from sklearn.datasets import load_breast_cancer
# The two ensemble classifiers trained in later cells
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
# Helper that partitions the data into train and test sets
from sklearn.model_selection import train_test_split
# Metrics printed after each model is fitted
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Reached only if every import above succeeded, so a missing package is obvious
print("Libraries imported successfully!")

Libraries imported successfully!


In [2]:
# Fetch the Breast Cancer dataset bundle
data = load_breast_cancer()

# Unpack the feature matrix, the label vector and their human-readable names
X, y = data.data, data.target
feature_names, target_names = data.feature_names, data.target_names

# Collect the summary figures first, then report them in one place
n_features = len(feature_names)
class_counts = np.bincount(y)  # number of samples per integer class label
leading_feature_names = feature_names[:5]

print("Number of features:", n_features)
print("Target names:", target_names)
print("\nShape of data:", X.shape)
print("Shape of target:", y.shape)
print("\nClass distribution:", class_counts)
print("\nFirst feature names:", leading_feature_names)

Number of features: 30
Target names: ['malignant' 'benign']

Shape of data: (569, 30)
Shape of target: (569,)

Class distribution: [212 357]

First feature names: ['mean radius' 'mean texture' 'mean perimeter' 'mean area'
 'mean smoothness']


In [3]:
# Split data into train and test sets
# The split parameters are named so they are easy to locate and tune.
SPLIT_TEST_SIZE = 0.3  # fraction of samples held out for evaluation
SPLIT_SEED = 42        # fixed seed so the same partition is produced on every run

# stratify=y asks for the class proportions of y to be kept in both partitions
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=SPLIT_TEST_SIZE, random_state=SPLIT_SEED, stratify=y
)

# Sanity checks: the partitions must cover every sample exactly once and
# features/labels must stay aligned row-for-row.
assert X_train.shape[0] + X_test.shape[0] == X.shape[0], "split lost or duplicated samples"
assert X_train.shape[0] == y_train.shape[0], "training features and labels are misaligned"
assert X_test.shape[0] == y_test.shape[0], "testing features and labels are misaligned"
assert X_train.shape[1] == X.shape[1] == X_test.shape[1], "feature count changed during split"

print("Training set size:", X_train.shape)
print("Testing set size:", X_test.shape)
print("\nClass distribution in training set:", np.bincount(y_train))
print("Class distribution in testing set:", np.bincount(y_test))

Training set size: (398, 30)
Testing set size: (171, 30)

Class distribution in training set: [148 250]
Class distribution in testing set: [ 64 107]


In [4]:
# Build a 100-tree Random Forest with a fixed seed and fit it on the training split
# (fit() returns the estimator itself, so construction and training can be chained)
rf_model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Predict labels for the held-out test split
y_pred_rf = rf_model.predict(X_test)

# Score the predictions: overall accuracy, confusion matrix and per-class report
accuracy_rf = accuracy_score(y_test, y_pred_rf)
confusion_rf = confusion_matrix(y_test, y_pred_rf)
report_rf = classification_report(y_test, y_pred_rf, target_names=target_names)

print("Random Forest Accuracy: {:.2f}%".format(accuracy_rf*100))
print("\nConfusion Matrix:\n", confusion_rf)
print("\nClassification Report:\n", report_rf)

Random Forest Accuracy: 93.57%

Confusion Matrix:
 [[ 58   6]
 [  5 102]]

Classification Report:
               precision    recall  f1-score   support

   malignant       0.92      0.91      0.91        64
      benign       0.94      0.95      0.95       107

    accuracy                           0.94       171
   macro avg       0.93      0.93      0.93       171
weighted avg       0.94      0.94      0.94       171



In [5]:
# Configure Gradient Boosting: 100 boosting stages, learning rate 0.1, fixed seed
gb_model = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=0.1,
    random_state=42,
)
# Train on the training split
gb_model.fit(X_train, y_train)

# Predict labels for the held-out test split
y_pred_gb = gb_model.predict(X_test)

# Score the predictions: overall accuracy, confusion matrix and per-class report
accuracy_gb = accuracy_score(y_test, y_pred_gb)
confusion_gb = confusion_matrix(y_test, y_pred_gb)
report_gb = classification_report(y_test, y_pred_gb, target_names=target_names)

print("Gradient Boosting Accuracy: {:.2f}%".format(accuracy_gb*100))
print("\nConfusion Matrix:\n", confusion_gb)
print("\nClassification Report:\n", report_gb)

Gradient Boosting Accuracy: 94.74%

Confusion Matrix:
 [[ 58   6]
 [  3 104]]

Classification Report:
               precision    recall  f1-score   support

   malignant       0.95      0.91      0.93        64
      benign       0.95      0.97      0.96       107

    accuracy                           0.95       171
   macro avg       0.95      0.94      0.94       171
weighted avg       0.95      0.95      0.95       171



In [7]:
# Spot-check the Random Forest on a single row of the test split
sample_idx = 5  # position within X_test; change it to inspect a different sample
# Reshape the selected row to (1, n_features) so it is passed as a one-sample batch
sample = X_test[sample_idx].reshape(1, -1)

# Map the integer class labels back to their diagnosis names
predicted_class = rf_model.predict(sample)[0]
true_label = target_names[y_test[sample_idx]]
predicted_label = target_names[predicted_class]

print("Sample features (first 5):", sample[0][:5])
print(f"True diagnosis: {true_label}")
print(f"Predicted diagnosis: {predicted_label}")

# Pair every feature name with the forest's importance score and rank the
# pairs from most to least important
feature_importances = sorted(
    zip(feature_names, rf_model.feature_importances_),
    key=lambda pair: pair[1],
    reverse=True,
)

print("\nTop 5 most important features:")
for name, importance in feature_importances[:5]:
    print(f"{name}: {importance:.4f}")

Sample features (first 5): [1.546e+01 2.395e+01 1.038e+02 7.313e+02 1.183e-01]
True diagnosis: malignant
Predicted diagnosis: malignant

Top 5 most important features:
worst concave points: 0.1590
worst area: 0.1470
worst perimeter: 0.0858
worst radius: 0.0790
mean radius: 0.0777
