Importing the Dependencies


In [1]:
import numpy as np
import pandas as pd
import sklearn.datasets
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, mean_absolute_error, mean_squared_error


Data Collection And Processing


In [2]:
# loading the data from sklearn
breast_cancer_dataset = sklearn.datasets.load_breast_cancer()

In [3]:
# loading the data into a pd dframe

breastCancerDF = pd.DataFrame(
    breast_cancer_dataset.data, columns=breast_cancer_dataset.feature_names
)

In [4]:
# adding the target column
breastCancerDF["label"] = breast_cancer_dataset.target

breastCancerDF.tail()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,label
564,21.56,22.39,142.0,1479.0,0.111,0.1159,0.2439,0.1389,0.1726,0.05623,...,26.4,166.1,2027.0,0.141,0.2113,0.4107,0.2216,0.206,0.07115,0
565,20.13,28.25,131.2,1261.0,0.0978,0.1034,0.144,0.09791,0.1752,0.05533,...,38.25,155.0,1731.0,0.1166,0.1922,0.3215,0.1628,0.2572,0.06637,0
566,16.6,28.08,108.3,858.1,0.08455,0.1023,0.09251,0.05302,0.159,0.05648,...,34.12,126.7,1124.0,0.1139,0.3094,0.3403,0.1418,0.2218,0.0782,0
567,20.6,29.33,140.1,1265.0,0.1178,0.277,0.3514,0.152,0.2397,0.07016,...,39.42,184.6,1821.0,0.165,0.8681,0.9387,0.265,0.4087,0.124,0
568,7.76,24.54,47.92,181.0,0.05263,0.04362,0.0,0.0,0.1587,0.05884,...,30.37,59.16,268.6,0.08996,0.06444,0.0,0.0,0.2871,0.07039,1


In [5]:
breastCancerDF.shape

(569, 31)

In [6]:
# checking the data
breastCancerDF.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 31 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   mean radius              569 non-null    float64
 1   mean texture             569 non-null    float64
 2   mean perimeter           569 non-null    float64
 3   mean area                569 non-null    float64
 4   mean smoothness          569 non-null    float64
 5   mean compactness         569 non-null    float64
 6   mean concavity           569 non-null    float64
 7   mean concave points      569 non-null    float64
 8   mean symmetry            569 non-null    float64
 9   mean fractal dimension   569 non-null    float64
 10  radius error             569 non-null    float64
 11  texture error            569 non-null    float64
 12  perimeter error          569 non-null    float64
 13  area error               569 non-null    float64
 14  smoothness error         5

In [7]:
# checking the distribution of label variable

# 0 = benign (non cancerous)
# 1 = malignant ( cancerous)

breastCancerDF["label"].value_counts()

label
1    357
0    212
Name: count, dtype: int64

In [8]:
breastCancerDF.groupby("label").mean()

Unnamed: 0_level_0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,17.46283,21.604906,115.365377,978.376415,0.102898,0.145188,0.160775,0.08799,0.192909,0.06268,...,21.134811,29.318208,141.37033,1422.286321,0.144845,0.374824,0.450606,0.182237,0.323468,0.09153
1,12.146524,17.914762,78.075406,462.790196,0.092478,0.080085,0.046058,0.025717,0.174186,0.062867,...,13.379801,23.51507,87.005938,558.89944,0.124959,0.182673,0.166238,0.074444,0.270246,0.079442


Seperating the features and target(labels)


In [9]:
X = breastCancerDF.drop("label", axis=1)
Y = breastCancerDF["label"]

Splitting the data into training and testing data


In [10]:
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=2
)

In [11]:
print(X.shape, X_train.shape, X_test.shape)

(569, 30) (455, 30) (114, 30)


### Model Training


Logistic Regression


In [12]:
model = LogisticRegression()

model.fit(X_train,Y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


### Model Evaluation

Accuracy Score

In [13]:
# Calculate the accuracy of the training data
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(Y_train, X_train_prediction)
print(f"Accuracy of the training data: {training_data_accuracy:.2%}")

Accuracy of the training data: 94.95%


In [14]:
# Calculate the accuracy of the testing data
X_test_prediction = model.predict(X_test)
testing_data_accuracy = accuracy_score(Y_test, X_test_prediction)
print(f"Accuracy of the testing data: {testing_data_accuracy:.2%}")

Accuracy of the testing data: 92.98%


In [15]:
# Calculate Mean Absolute Error (MAE)
mae = mean_absolute_error(Y_test, X_test_prediction)
print(f"Mean Absolute Error (MAE): {mae:.4f}")

Mean Absolute Error (MAE): 0.0702


In [16]:
# Calculate Mean Squared Error (MSE)
mse = mean_squared_error(Y_test, X_test_prediction)
print(f"Mean Squared Error (MSE): {mse:.4f}")

Mean Squared Error (MSE): 0.0702


In [17]:
# Calculate Root Mean Squared Error (RMSE)
rmse = np.sqrt(mse)
print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")

Root Mean Squared Error (RMSE): 0.2649


Building a predictive system

In [18]:
# Create an input data point for prediction
input_data = (
    13.08,
    15.71,
    85.63,
    520,
    0.1075,
    0.127,
    0.04568,
    0.0311,
    0.1967,
    0.06811,
    0.1852,
    0.7477,
    1.383,
    14.67,
    0.004097,
    0.01898,
    0.01698,
    0.00649,
    0.01678,
    0.002425,
    14.5,
    20.49,
    96.09,
    630.5,
    0.1312,
    0.2776,
    0.189,
    0.07283,
    0.3184,
    0.081838510824,
)

# Convert the input data to a NumPy array
input_data_NP = np.asarray(input_data)

# Reshape the input data to match the model's expected shape
input_data_reshaped = input_data_NP.reshape(1, -1)

# Make a prediction using the trained model
prediction = model.predict(input_data_reshaped)
if prediction[0] == 0:
    print("Prediction: The tumor is predicted to be benign (non-cancerous).")
else:
    print("Prediction: The tumor is predicted to be malignant (cancerous).")

Prediction: The tumor is predicted to be malignant (cancerous).


