In [2]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report

In [3]:
# Load the breast cancer Wisconsin (diagnostic) dataset that ships with scikit-learn.
data = load_breast_cancer()
# Display the returned object so its fields can be inspected.
data

# Fields of the returned object that this notebook relies on:
#   data          --> features, as a NumPy array
#   target        --> labels
#   feature_names --> column names
#   DESCR         --> full description of the dataset

{'data': array([[1.799e+01, 1.038e+01, 1.228e+02, ..., 2.654e-01, 4.601e-01,
         1.189e-01],
        [2.057e+01, 1.777e+01, 1.329e+02, ..., 1.860e-01, 2.750e-01,
         8.902e-02],
        [1.969e+01, 2.125e+01, 1.300e+02, ..., 2.430e-01, 3.613e-01,
         8.758e-02],
        ...,
        [1.660e+01, 2.808e+01, 1.083e+02, ..., 1.418e-01, 2.218e-01,
         7.820e-02],
        [2.060e+01, 2.933e+01, 1.401e+02, ..., 2.650e-01, 4.087e-01,
         1.240e-01],
        [7.760e+00, 2.454e+01, 4.792e+01, ..., 0.000e+00, 2.871e-01,
         7.039e-02]], shape=(569, 30)),
 'target': array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
        0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0,
        1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0,
        1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1,
        1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1,

In [None]:
# List the field names available on the loaded dataset object.
print(data.keys())

dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names', 'filename', 'data_module'])
<class 'sklearn.utils._bunch.Bunch'>


In [5]:
# Shape of the feature matrix:
# 569 rows (patients) × 30 features (measurements such as radius, area, texture, etc.)
print(data.data.shape)

(569, 30)


In [6]:
# Show the first 25 labels, then the class names in label order.
# Label encoding:
#   0 -> malignant (cancerous)
#   1 -> benign (non-cancerous)
print(data.target[:25])
print(data.target_names)

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 0 0 0]
['malignant' 'benign']


In [7]:
# Names of the 30 feature columns, in the same order as the columns of data.data.
print(data.feature_names)

['mean radius' 'mean texture' 'mean perimeter' 'mean area'
 'mean smoothness' 'mean compactness' 'mean concavity'
 'mean concave points' 'mean symmetry' 'mean fractal dimension'
 'radius error' 'texture error' 'perimeter error' 'area error'
 'smoothness error' 'compactness error' 'concavity error'
 'concave points error' 'symmetry error' 'fractal dimension error'
 'worst radius' 'worst texture' 'worst perimeter' 'worst area'
 'worst smoothness' 'worst compactness' 'worst concavity'
 'worst concave points' 'worst symmetry' 'worst fractal dimension']


In [8]:
# Print the dataset's built-in text description (attribute definitions, instance counts, etc.).
print(data.DESCR)

.. _breast_cancer_dataset:

Breast cancer Wisconsin (diagnostic) dataset
--------------------------------------------

**Data Set Characteristics:**

:Number of Instances: 569

:Number of Attributes: 30 numeric, predictive attributes and the class

:Attribute Information:
    - radius (mean of distances from center to points on the perimeter)
    - texture (standard deviation of gray-scale values)
    - perimeter
    - area
    - smoothness (local variation in radius lengths)
    - compactness (perimeter^2 / area - 1.0)
    - concavity (severity of concave portions of the contour)
    - concave points (number of concave portions of the contour)
    - symmetry
    - fractal dimension ("coastline approximation" - 1)

    The mean, standard error, and "worst" or largest (mean of the three
    worst/largest values) of these features were computed for each image,
    resulting in 30 features.  For instance, field 0 is Mean Radius, field
    10 is Radius SE, field 20 is Worst Radius.

    - 

In [9]:
# Separate the loaded dataset into the feature matrix (X) and the label vector (y).
X, y = data.data, data.target

In [10]:
# Hold out 30% of the samples as a test set; the fixed random_state keeps the
# split identical from one run to the next.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=45,
)

In [11]:
# Logistic regression is fitted iteratively: the solver updates the weights a
# little on every iteration until they stop changing (convergence).
# If the solver has not converged within max_iter iterations (scikit-learn's
# default for LogisticRegression is 100), scikit-learn emits a ConvergenceWarning
# suggesting that max_iter be increased. A larger max_iter gives the solver
# enough iterations to finish adjusting the parameters on these unscaled features.
model = LogisticRegression(max_iter=3000)
# Last expression of the cell: fits the model and displays the fitted estimator.
model.fit(X_train, y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,3000


In [17]:
# predict_proba() gives one row per test sample and one column per class,
# i.e. an array of shape (n_samples, 2) for this binary problem.
# Indexing with [:, 1] keeps every row but only the second column, which is
# the probability of class 1 (benign in this dataset; class 0 is malignant).
y_prob = model.predict_proba(X_test)[:, 1]

# Peek at the first ten probabilities as a sanity check.
print("Sample predicted probabilities", y_prob[:10])

Sample predicted probabilities [9.87926483e-01 2.10607634e-04 3.79497149e-08 9.99750427e-01
 9.99988254e-01 9.99857756e-01 9.75602114e-01 9.99797315e-01
 9.99890445e-01 9.98745633e-01]


In [18]:

# Convert the class-1 probabilities into hard 0/1 predictions at three cut-offs.
# Comparing y_prob with a threshold yields a boolean array, and .astype(int)
# maps it to integers: [False, True, True, False] -> [0, 1, 1, 0].

# Default threshold 0.5:
# predict 1 when the probability is at least 0.5, otherwise 0.
y_pred_05 = (y_prob >= 0.5).astype(int)

# Stricter threshold 0.7:
# predict 1 only when the probability is at least 0.7 (model must be more confident).
y_pred_07 = (y_prob >= 0.7).astype(int)

# Very high threshold 0.9:
# predict 1 only when the model is almost certain.
y_pred_09 = (y_prob >= 0.9).astype(int)

# Backward-compatible alias. These predictions were previously stored only under
# the misleading name y_pred_03 even though the cut-off is 0.9, and later cells
# still read that name. Prefer y_pred_09 in new code.
y_pred_03 = y_pred_09

In [20]:
# Evaluation of the predictions made with the default 0.5 threshold.
#
# The confusion matrix holds counts laid out as
#   [[True Negatives,  False Positives],
#    [False Negatives, True Positives]]
# where "negative" is class 0 and "positive" is class 1.
#
# The classification report that follows lists detailed per-class metrics.
print(
    "When Threshold = 0.5",
    confusion_matrix(y_test, y_pred_05),
    classification_report(y_test, y_pred_05),
    sep="\n",
)

When Threshold = 0.5
[[ 56   5]
 [  2 108]]
              precision    recall  f1-score   support

           0       0.97      0.92      0.94        61
           1       0.96      0.98      0.97       110

    accuracy                           0.96       171
   macro avg       0.96      0.95      0.95       171
weighted avg       0.96      0.96      0.96       171



In [15]:
# Evaluation of the predictions made with the stricter 0.7 threshold:
# heading, confusion matrix, then the per-class classification report.
print(
    "When Threshold = 0.7",
    confusion_matrix(y_test, y_pred_07),
    classification_report(y_test, y_pred_07),
    sep="\n",
)

When Threshold = 0.7
[[ 58   3]
 [  2 108]]
              precision    recall  f1-score   support

           0       0.97      0.95      0.96        61
           1       0.97      0.98      0.98       110

    accuracy                           0.97       171
   macro avg       0.97      0.97      0.97       171
weighted avg       0.97      0.97      0.97       171



In [16]:
# Evaluation of the predictions made with the very high 0.9 threshold.
# NOTE: the variable is named y_pred_03, but it is computed as
# (y_prob >= 0.9), so the heading must say 0.9 rather than 0.3.
print("When Threshold = 0.9")
print(confusion_matrix(y_test, y_pred_03))
print(classification_report(y_test, y_pred_03))

When Threshold = 0.3
[[ 58   3]
 [  7 103]]
              precision    recall  f1-score   support

           0       0.89      0.95      0.92        61
           1       0.97      0.94      0.95       110

    accuracy                           0.94       171
   macro avg       0.93      0.94      0.94       171
weighted avg       0.94      0.94      0.94       171

