In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

In [3]:
pd.set_option('future.no_silent_downcasting', True)

# Dropping unwanted columns
df= pd.read_csv('../dataset/breast_cancer_data.csv')
df['diagnosis'] = df['diagnosis'].replace({'M': 1, 'B': 0}).astype(int)
df.drop(columns=['id', 'Unnamed: 32', 'radius_se', 'texture_se', 'perimeter_se', 'area_se',
  'smoothness_se', 'compactness_se', 'concavity_se', 'concave points_se',
  'symmetry_se', 'fractal_dimension_se',
  'radius_worst', 'texture_worst', 'perimeter_worst', 'area_worst',
  'smoothness_worst', 'compactness_worst', 'concavity_worst',
  'concave points_worst', 'symmetry_worst', 'fractal_dimension_worst'], axis=1, inplace=True)
df.shape

(569, 11)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 11 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   diagnosis               569 non-null    int32  
 1   radius_mean             569 non-null    float64
 2   texture_mean            569 non-null    float64
 3   perimeter_mean          569 non-null    float64
 4   area_mean               569 non-null    float64
 5   smoothness_mean         569 non-null    float64
 6   compactness_mean        569 non-null    float64
 7   concavity_mean          569 non-null    float64
 8   concave points_mean     569 non-null    float64
 9   symmetry_mean           569 non-null    float64
 10  fractal_dimension_mean  569 non-null    float64
dtypes: float64(10), int32(1)
memory usage: 46.8 KB


In [5]:
# Checking for null values
df.isnull().sum()

diagnosis                 0
radius_mean               0
texture_mean              0
perimeter_mean            0
area_mean                 0
smoothness_mean           0
compactness_mean          0
concavity_mean            0
concave points_mean       0
symmetry_mean             0
fractal_dimension_mean    0
dtype: int64

In [6]:
# Statistical measures about the data
df.describe()

Unnamed: 0,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,fractal_dimension_mean
count,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0
mean,0.372583,14.127292,19.289649,91.969033,654.889104,0.09636,0.104341,0.088799,0.048919,0.181162,0.062798
std,0.483918,3.524049,4.301036,24.298981,351.914129,0.014064,0.052813,0.07972,0.038803,0.027414,0.00706
min,0.0,6.981,9.71,43.79,143.5,0.05263,0.01938,0.0,0.0,0.106,0.04996
25%,0.0,11.7,16.17,75.17,420.3,0.08637,0.06492,0.02956,0.02031,0.1619,0.0577
50%,0.0,13.37,18.84,86.24,551.1,0.09587,0.09263,0.06154,0.0335,0.1792,0.06154
75%,1.0,15.78,21.8,104.1,782.7,0.1053,0.1304,0.1307,0.074,0.1957,0.06612
max,1.0,28.11,39.28,188.5,2501.0,0.1634,0.3454,0.4268,0.2012,0.304,0.09744


In [7]:
# Checking the distribution of Target Variable
df['diagnosis'].value_counts()

diagnosis
0    357
1    212
Name: count, dtype: int64

# 1--> M
# 0--> B

In [8]:
df.groupby('diagnosis').mean()

Unnamed: 0_level_0,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,fractal_dimension_mean
diagnosis,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,12.146524,17.914762,78.075406,462.790196,0.092478,0.080085,0.046058,0.025717,0.174186,0.062867
1,17.46283,21.604906,115.365377,978.376415,0.102898,0.145188,0.160775,0.08799,0.192909,0.06268


In [9]:
# Separating features and target

X = df.drop(columns='diagnosis', axis=1)
y = df['diagnosis']

In [10]:
# Standardize features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [11]:
# Splitting into training and test data
X_train, X_test, y_train, y_test= train_test_split(X_scaled, y, test_size=0.2, random_state=50, stratify=y)
# print(X.shape, X_train.shape, X_test.shape)
# print(y.shape, y_train.shape, y_test.shape)

In [12]:
# Model training
model= LogisticRegression()
model.fit(X_train, y_train)

In [13]:
# Model evaluation
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(y_train, X_train_prediction)
print('Accuracy **training data** ', training_data_accuracy)

Accuracy **training data**  0.9428571428571428


In [14]:
X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(y_test, X_test_prediction)
print('Accuracy **test data** ', test_data_accuracy)

Accuracy **test data**  0.9473684210526315


In [16]:
input_data = [[
    10.2,   # radius_mean
    65.0,   # perimeter_mean
    330.0,  # area_mean
    0.02,   # concavity_mean (low irregularity)
    0.015,  # concave points_mean
    0.04,   # compactness_mean
    13.5,   # texture_mean
    0.08,   # smoothness_mean
    0.15,   # symmetry_mean
    0.05    # fractal_dimension_mean
]]


prediction = model.predict(input_data)
print(prediction)

if (prediction[0] == 1):
  print('The Breast cancer is Malignant')

else:
  print('The Breast Cancer is Benign')

[1]
The Breast cancer is Malignant
