# Breast Cancer Classification using Logistic Regression
## @maheshaViduranga
## Medium link: https://medium.com/@maheshaviduranga


### Data loading and preprocessing

In [1]:
# Link to the Dataset = https://www.kaggle.com/datasets/yasserh/breast-cancer-dataset

# Import necessary models

import numpy as np
import pandas as pd
import os
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# Read the csv information
patient_information_data = pd.read_csv('/content/drive/MyDrive/Machine Learning/Logistic Regression/Dataset/breast-cancer.csv')

In [3]:
# Visualize the data structure
patient_information_data.head()

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [4]:
print(patient_information_data.shape)

(569, 32)


In [32]:
# Visualize statistical parameters of data
patient_information_data.describe()

Unnamed: 0,id,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
count,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,...,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0
mean,30371830.0,14.127292,19.289649,91.969033,654.889104,0.09636,0.104341,0.088799,0.048919,0.181162,...,16.26919,25.677223,107.261213,880.583128,0.132369,0.254265,0.272188,0.114606,0.290076,0.083946
std,125020600.0,3.524049,4.301036,24.298981,351.914129,0.014064,0.052813,0.07972,0.038803,0.027414,...,4.833242,6.146258,33.602542,569.356993,0.022832,0.157336,0.208624,0.065732,0.061867,0.018061
min,8670.0,6.981,9.71,43.79,143.5,0.05263,0.01938,0.0,0.0,0.106,...,7.93,12.02,50.41,185.2,0.07117,0.02729,0.0,0.0,0.1565,0.05504
25%,869218.0,11.7,16.17,75.17,420.3,0.08637,0.06492,0.02956,0.02031,0.1619,...,13.01,21.08,84.11,515.3,0.1166,0.1472,0.1145,0.06493,0.2504,0.07146
50%,906024.0,13.37,18.84,86.24,551.1,0.09587,0.09263,0.06154,0.0335,0.1792,...,14.97,25.41,97.66,686.5,0.1313,0.2119,0.2267,0.09993,0.2822,0.08004
75%,8813129.0,15.78,21.8,104.1,782.7,0.1053,0.1304,0.1307,0.074,0.1957,...,18.79,29.72,125.4,1084.0,0.146,0.3391,0.3829,0.1614,0.3179,0.09208
max,911320500.0,28.11,39.28,188.5,2501.0,0.1634,0.3454,0.4268,0.2012,0.304,...,36.04,49.54,251.2,4254.0,0.2226,1.058,1.252,0.291,0.6638,0.2075


In [6]:
# Number of samples in each class
patient_information_data['diagnosis'].value_counts()

B    357
M    212
Name: diagnosis, dtype: int64



*   B means having Benign
*   M means Malignant




In [7]:
# Only take 210 samples from majority class to prevent bias of the model
num_samples_to_keep = 210

# Undersample the majority class
undersampled_patient_information_data = pd.concat([
    patient_information_data[patient_information_data['diagnosis'] == 'B'].sample(num_samples_to_keep, random_state=42),
    patient_information_data[patient_information_data['diagnosis'] == 'M']
], axis=0)

# Shuffle the resulting DataFrame to ensure randomness
undersampled_patient_information_data = undersampled_patient_information_data.sample(frac=1, random_state=42).reset_index(drop=True)

In [8]:
# Visualize undersampled data
undersampled_patient_information_data

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,874839,B,12.30,15.90,78.83,463.7,0.08080,0.07253,0.03844,0.01654,...,13.35,19.59,86.65,546.7,0.1096,0.1650,0.14230,0.04815,0.2482,0.06306
1,885429,M,19.73,19.82,130.70,1206.0,0.10620,0.18490,0.24170,0.09740,...,25.28,25.59,159.80,1933.0,0.1710,0.5955,0.84890,0.25070,0.2749,0.12970
2,862989,B,10.49,19.29,67.41,336.1,0.09989,0.08578,0.02995,0.01201,...,11.54,23.31,74.22,402.8,0.1219,0.1486,0.07987,0.03203,0.2826,0.07552
3,901088,M,20.44,21.78,133.80,1293.0,0.09150,0.11310,0.09799,0.07785,...,24.31,26.37,161.20,1780.0,0.1327,0.2376,0.27020,0.17650,0.2609,0.06735
4,926424,M,21.56,22.39,142.00,1479.0,0.11100,0.11590,0.24390,0.13890,...,25.45,26.40,166.10,2027.0,0.1410,0.2113,0.41070,0.22160,0.2060,0.07115
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
417,9010259,B,13.05,18.59,85.09,512.0,0.10820,0.13040,0.09603,0.05603,...,14.19,24.85,94.22,591.2,0.1343,0.2658,0.25730,0.12580,0.3113,0.08317
418,925277,B,14.59,22.68,96.39,657.1,0.08473,0.13300,0.10290,0.03736,...,15.48,27.27,105.90,733.5,0.1026,0.3171,0.36620,0.11050,0.2258,0.08004
419,86135502,M,19.02,24.59,122.00,1076.0,0.09029,0.12060,0.14680,0.08271,...,24.56,30.41,152.90,1623.0,0.1249,0.3206,0.57550,0.19560,0.3956,0.09288
420,88995002,M,20.73,31.12,135.70,1419.0,0.09469,0.11430,0.13670,0.08646,...,32.49,47.16,214.00,3432.0,0.1401,0.2644,0.34420,0.16590,0.2868,0.08218


In [27]:
# Seperating input and labels
X = undersampled_patient_information_data.iloc[: , 2:] # undersampled_patient_information_data.drop(columns = 'diagnosis', columns = 'id', axis = 1)
Y = undersampled_patient_information_data['diagnosis']

In [28]:
print(X)
print(Y)

     radius_mean  texture_mean  perimeter_mean  area_mean  smoothness_mean  \
0          12.30         15.90           78.83      463.7          0.08080   
1          19.73         19.82          130.70     1206.0          0.10620   
2          10.49         19.29           67.41      336.1          0.09989   
3          20.44         21.78          133.80     1293.0          0.09150   
4          21.56         22.39          142.00     1479.0          0.11100   
..           ...           ...             ...        ...              ...   
417        13.05         18.59           85.09      512.0          0.10820   
418        14.59         22.68           96.39      657.1          0.08473   
419        19.02         24.59          122.00     1076.0          0.09029   
420        20.73         31.12          135.70     1419.0          0.09469   
421        16.30         15.70          104.70      819.8          0.09427   

     compactness_mean  concavity_mean  concave points_mean  sym

### Dataset Creation for train and test

In [29]:
# Split train and test set
X_train , X_test, Y_train, Y_test = train_test_split(X, Y, stratify=Y, random_state = 2)

In [30]:
print(X_train.shape , X_test.shape, Y_train.shape, Y_test.shape)

(316, 30) (106, 30) (316,) (106,)


### Train using logistic regression

In [33]:
# Initialize the model
model = LogisticRegression()

In [34]:
# Training Phase
model.fit(X_train,Y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


### Model Evaluation

In [35]:
# On training data
X_train_pred = model.predict(X_train)
train_accuracy = accuracy_score(X_train_pred,Y_train)

In [36]:
print("Training Accuracy is: ", train_accuracy)

Training Accuracy is:  0.9493670886075949


In [37]:
# On test data
X_test_pred = model.predict(X_test)
test_accuracy = accuracy_score(X_test_pred,Y_test)

In [38]:
print("Test Accuracy is: ", test_accuracy)

Test Accuracy is:  0.9245283018867925


### Predictions

In [41]:
# Get a sample Malignant input from the dataset
input = np.array([15.78,17.89,103.6,781,0.0971,0.1292,0.09954,0.06606,0.1842,0.06082,0.5058,0.9849,3.564,54.16,0.005771,0.04061,0.02791,0.01282,0.02008,0.004144,20.42,27.28,136.5,1299,0.1396,0.5609,0.3965,0.181,0.3792,0.1048])

In [50]:
# Reshape it to have 2D vector
input = input.reshape(1,-1)

In [51]:
input.shape

(1, 30)

In [52]:
# Predictions
model.predict(input)



array(['M'], dtype=object)