<a href="https://colab.research.google.com/github/GoryachevDaniil/Breast_cancer_Classification/blob/main/Breast_cancer_Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np

In [None]:
from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

In [None]:
# Load the scikit-learn breast cancer dataset bundle; later cells read its
# .data, .target, .feature_names and .DESCR attributes.
breast_cancer = load_breast_cancer()

In [None]:
# Print the description text bundled with the dataset.
print(breast_cancer.DESCR)

.. _breast_cancer_dataset:

Breast cancer wisconsin (diagnostic) dataset
--------------------------------------------

**Data Set Characteristics:**

    :Number of Instances: 569

    :Number of Attributes: 30 numeric, predictive attributes and the class

    :Attribute Information:
        - radius (mean of distances from center to points on the perimeter)
        - texture (standard deviation of gray-scale values)
        - perimeter
        - area
        - smoothness (local variation in radius lengths)
        - compactness (perimeter^2 / area - 1.0)
        - concavity (severity of concave portions of the contour)
        - concave points (number of concave portions of the contour)
        - symmetry
        - fractal dimension ("coastline approximation" - 1)

        The mean, standard error, and "worst" or largest (mean of the three
        worst/largest values) of these features were computed for each image,
        resulting in 30 features.  For instance, field 0 is Mean Radi

In [None]:
# Wrap the raw feature matrix in a DataFrame whose columns carry the feature names.
breast_cancer_df = pd.DataFrame(
    data=breast_cancer.data,
    columns=breast_cancer.feature_names,
)

In [None]:
# Append the class labels as a 'target' column next to the features.
breast_cancer_df = breast_cancer_df.assign(target=breast_cancer.target)

In [None]:
# Preview the first five rows of the assembled frame.
breast_cancer_df.head(n=5)

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,0
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,0
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,0
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,0
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,0


In [None]:
# Class balance: distinct target values and how many rows carry each of them.
unique, count = np.unique(breast_cancer_df['target'], return_counts=True)
(unique, count)

(array([0, 1]), array([212, 357]))

In [None]:
# Column dtypes and non-null counts, as a quick check for missing values.
breast_cancer_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 31 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   mean radius              569 non-null    float64
 1   mean texture             569 non-null    float64
 2   mean perimeter           569 non-null    float64
 3   mean area                569 non-null    float64
 4   mean smoothness          569 non-null    float64
 5   mean compactness         569 non-null    float64
 6   mean concavity           569 non-null    float64
 7   mean concave points      569 non-null    float64
 8   mean symmetry            569 non-null    float64
 9   mean fractal dimension   569 non-null    float64
 10  radius error             569 non-null    float64
 11  texture error            569 non-null    float64
 12  perimeter error          569 non-null    float64
 13  area error               569 non-null    float64
 14  smoothness error         5

In [None]:
# Summary statistics for every column, rounded to two decimals for readability.
breast_cancer_df.describe().round(decimals=2)

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
count,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,...,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0
mean,14.13,19.29,91.97,654.89,0.1,0.1,0.09,0.05,0.18,0.06,...,25.68,107.26,880.58,0.13,0.25,0.27,0.11,0.29,0.08,0.63
std,3.52,4.3,24.3,351.91,0.01,0.05,0.08,0.04,0.03,0.01,...,6.15,33.6,569.36,0.02,0.16,0.21,0.07,0.06,0.02,0.48
min,6.98,9.71,43.79,143.5,0.05,0.02,0.0,0.0,0.11,0.05,...,12.02,50.41,185.2,0.07,0.03,0.0,0.0,0.16,0.06,0.0
25%,11.7,16.17,75.17,420.3,0.09,0.06,0.03,0.02,0.16,0.06,...,21.08,84.11,515.3,0.12,0.15,0.11,0.06,0.25,0.07,0.0
50%,13.37,18.84,86.24,551.1,0.1,0.09,0.06,0.03,0.18,0.06,...,25.41,97.66,686.5,0.13,0.21,0.23,0.1,0.28,0.08,1.0
75%,15.78,21.8,104.1,782.7,0.11,0.13,0.13,0.07,0.2,0.07,...,29.72,125.4,1084.0,0.15,0.34,0.38,0.16,0.32,0.09,1.0
max,28.11,39.28,188.5,2501.0,0.16,0.35,0.43,0.2,0.3,0.1,...,49.54,251.2,4254.0,0.22,1.06,1.25,0.29,0.66,0.21,1.0


In [None]:
# Mean of every feature per target class, transposed so that each feature is
# a row and each class is a column.
data = breast_cancer_df.groupby(by='target').mean().transpose()

In [None]:
# Preview the first rows of the per-class feature means.
data.head()

target,0,1
mean radius,17.46283,12.146524
mean texture,21.604906,17.914762
mean perimeter,115.365377,78.075406
mean area,978.376415,462.790196
mean smoothness,0.102898,0.092478


In [None]:
# Absolute gap between the two per-class means of every feature.
data['diff'] = (data.iloc[:, 0] - data.iloc[:, 1]).abs()

In [None]:
# Put the features with the largest class-mean gap first.
data = data.sort_values('diff', ascending=False)

In [None]:
# Show the ten rows with the largest 'diff' value.
data.head(10)

target,0,1,diff
worst area,1422.286321,558.89944,863.386881
mean area,978.376415,462.790196,515.586219
worst perimeter,141.37033,87.005938,54.364392
area error,72.672406,21.135148,51.537257
mean perimeter,115.365377,78.075406,37.289971
worst radius,21.134811,13.379801,7.75501
worst texture,29.318208,23.51507,5.803138
mean radius,17.46283,12.146524,5.316306
mean texture,21.604906,17.914762,3.690144
perimeter error,4.323929,2.000321,2.323608


In [None]:
# Names of the first ten rows of `data`, used as the model's input features.
features = data.index[:10].tolist()
features

['worst area',
 'mean area',
 'worst perimeter',
 'area error',
 'mean perimeter',
 'worst radius',
 'worst texture',
 'mean radius',
 'mean texture',
 'perimeter error']

In [None]:
# Feature matrix (the selected columns) and label vector (the 'target' column).
# NOTE(review): `features` is ranked by the raw difference of class means, which
# favours large-magnitude columns (areas, perimeters) - consider a scale-free
# selection criterion.
X = breast_cancer_df.loc[:, features]
y = breast_cancer_df.loc[:, 'target']

In [None]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)
print(f'Train : {X_train.shape} {y_train.shape}\nTest: {X_test.shape} {y_test.shape}')

Train : (398, 10) (398,)
Test: (171, 10) (171,)


In [None]:
# Standardise the features using statistics learned from the training split only.
scaler = StandardScaler()

# fit_transform learns the per-column mean/std on the training rows and applies them.
X_train_scaled = scaler.fit_transform(X_train)
# The test rows must reuse the training statistics (transform, not fit_transform):
# refitting on the test split leaks test information and puts the two splits
# on different scales.
X_test_scaled = scaler.transform(X_test)

In [None]:
# Logistic regression failed to converge within its default iteration limit on
# the raw features (see the solver warning recorded below this cell), so
# standardisation is chained in front of the classifier. The pipeline fits the
# scaler on the training rows only and re-applies it inside predict(), so
# callers keep passing the unscaled feature frames.
model = make_pipeline(StandardScaler(), LogisticRegression())
model.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression()

In [None]:
# Predict a class label for every row of the held-out test features.
y_pred = model.predict(X_test)

In [None]:
# Peek at the first five predicted labels.
y_pred[:5]

array([1, 0, 0, 1, 1])

In [None]:
# Confusion matrix with class 1 listed before class 0: rows are the actual
# classes, columns the predicted ones. The Russian captions read
# "actual/predicted benign" followed by "actual/predicted malignant".
# NOTE(review): this assumes target 1 = benign and 0 = malignant - confirm
# against breast_cancer.target_names.
model_matrix = confusion_matrix(y_test, y_pred, labels=[1, 0])
model_matrix_df = pd.DataFrame(
    model_matrix,
    index=['Факт доброкачественное', 'Факт злокачественное'],
    columns=['Прогноз доброкачественное', 'Прогноз злокачественное'],
)
model_matrix_df

Unnamed: 0,Прогноз доброкачественное,Прогноз злокачественное
Факт доброкачественное,106,2
Факт злокачественное,4,59


In [None]:
# Share of test samples whose predicted label matches the true label.
model_score = accuracy_score(y_test, y_pred)
# Use the built-in round(): it works whether accuracy_score returns a NumPy
# scalar or a plain Python float (which has no .round() method).
round(model_score, 2)

0.96