## **Importing the dependencies**

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

## **Data Collection & Data Processing**

In [2]:
# Importing the dataset from drive
dataset = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Projects/Breast_Cancer_Classification/data.csv')

In [3]:
dataset.drop(['Unnamed: 32','id'],axis=1,inplace=True)

In [4]:
# Loading the data to a dataframe
data_frame = pd.DataFrame(dataset)

In [5]:
# Number of rows and columns
data_frame.shape

(569, 31)

In [6]:
# Info of data
data_frame.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 31 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   diagnosis                569 non-null    object 
 1   radius_mean              569 non-null    float64
 2   texture_mean             569 non-null    float64
 3   perimeter_mean           569 non-null    float64
 4   area_mean                569 non-null    float64
 5   smoothness_mean          569 non-null    float64
 6   compactness_mean         569 non-null    float64
 7   concavity_mean           569 non-null    float64
 8   concave points_mean      569 non-null    float64
 9   symmetry_mean            569 non-null    float64
 10  fractal_dimension_mean   569 non-null    float64
 11  radius_se                569 non-null    float64
 12  texture_se               569 non-null    float64
 13  perimeter_se             569 non-null    float64
 14  area_se                  5

In [7]:
# Checking for the missing values
data_frame.isnull().sum()

Unnamed: 0,0
diagnosis,0
radius_mean,0
texture_mean,0
perimeter_mean,0
area_mean,0
smoothness_mean,0
compactness_mean,0
concavity_mean,0
concave points_mean,0
symmetry_mean,0


In [8]:
# Statistical measures about the data
data_frame.describe()

Unnamed: 0,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,fractal_dimension_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
count,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,...,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0
mean,14.127292,19.289649,91.969033,654.889104,0.09636,0.104341,0.088799,0.048919,0.181162,0.062798,...,16.26919,25.677223,107.261213,880.583128,0.132369,0.254265,0.272188,0.114606,0.290076,0.083946
std,3.524049,4.301036,24.298981,351.914129,0.014064,0.052813,0.07972,0.038803,0.027414,0.00706,...,4.833242,6.146258,33.602542,569.356993,0.022832,0.157336,0.208624,0.065732,0.061867,0.018061
min,6.981,9.71,43.79,143.5,0.05263,0.01938,0.0,0.0,0.106,0.04996,...,7.93,12.02,50.41,185.2,0.07117,0.02729,0.0,0.0,0.1565,0.05504
25%,11.7,16.17,75.17,420.3,0.08637,0.06492,0.02956,0.02031,0.1619,0.0577,...,13.01,21.08,84.11,515.3,0.1166,0.1472,0.1145,0.06493,0.2504,0.07146
50%,13.37,18.84,86.24,551.1,0.09587,0.09263,0.06154,0.0335,0.1792,0.06154,...,14.97,25.41,97.66,686.5,0.1313,0.2119,0.2267,0.09993,0.2822,0.08004
75%,15.78,21.8,104.1,782.7,0.1053,0.1304,0.1307,0.074,0.1957,0.06612,...,18.79,29.72,125.4,1084.0,0.146,0.3391,0.3829,0.1614,0.3179,0.09208
max,28.11,39.28,188.5,2501.0,0.1634,0.3454,0.4268,0.2012,0.304,0.09744,...,36.04,49.54,251.2,4254.0,0.2226,1.058,1.252,0.291,0.6638,0.2075


In [9]:
# Converting alphabetical data to numeric
data_frame['diagnosis'] = data_frame['diagnosis'].map({'M':1,'B':0})

In [10]:
# Droping last 10 rows for random data prediction (not as test case)
forPrediction = data_frame.tail(10)
data_frame.drop(data_frame.tail(10).index,inplace=True)

In [11]:
# Checking the distribution of 'diagnosis' variable
data_frame['diagnosis'].value_counts()

Unnamed: 0_level_0,count
diagnosis,Unnamed: 1_level_1
0,353
1,206


Here, in diagnosis:
*   **0 -> Benign**
*   **1 -> Malignant**

In [12]:
# Groupby diagnosis and getting mean of all, From this we can observe the data of benign cases and malignant cases
data_frame.groupby('diagnosis').mean()

Unnamed: 0_level_0,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,fractal_dimension_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
diagnosis,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,12.158042,17.82034,78.154193,463.584136,0.092622,0.08016,0.046138,0.025771,0.174581,0.062896,...,13.392161,23.387762,87.094306,559.88102,0.125139,0.18305,0.166717,0.074718,0.270816,0.079492
1,17.413058,21.439223,114.997379,973.246602,0.102857,0.144412,0.158641,0.087235,0.192828,0.062668,...,21.091796,29.150825,140.923835,1417.901456,0.145089,0.372191,0.445084,0.181327,0.324173,0.091383


*   **From this, We can observe that the values of 'Malignant' cases are higher as compared to the values of 'Benign' cases.**


## **Splitting the data into Training and Testing**

In [13]:
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(data_frame, data_frame['diagnosis']):
    train = data_frame.loc[train_index]
    test = data_frame.loc[test_index]


In [14]:
print(data_frame.shape, train.shape, test.shape)

(559, 31) (447, 31) (112, 31)


## **Separating the Features and the Target**

In [15]:
X_train = train.drop(columns='diagnosis', axis=1)  # Features
Y_train = train['diagnosis']   # Target

X_test = test.drop(columns='diagnosis', axis=1)  # Features
Y_test = test['diagnosis']   # Target

## **Model Training**

### **Using Logistic Regression:**

In [16]:
model = LogisticRegression()

In [17]:
# Training the model using training data
model.fit(X_train, Y_train)

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


## **Model Evaluation**

## **Accuracy Score**

In [18]:
# Accuracy on training data
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(Y_train, X_train_prediction)

In [19]:
print("Accuracy on training data: ", training_data_accuracy)

Accuracy on training data:  0.9463087248322147


In [20]:
# Accuracy on testing data
X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(Y_test, X_test_prediction)

In [21]:
print("Accuracy on test data: ", test_data_accuracy)

Accuracy on test data:  0.9375


## **Building a Predictive System**

In [22]:
input_data = (20.6,29.33,140.1,1265,0.1178,0.277,0.3514,0.152,0.2397,0.07016,0.726,1.595,5.772,86.22,0.006522,0.06158,0.07117,0.01664,0.02324,0.006185,25.74,39.42,184.6,1821,0.165,0.8681,0.9387,0.265,0.4087,0.124)

# Changing the input data to a numpy array
idnpary = np.asarray(input_data)

# Reshaping the numpy array as we are predicting for one data point
input_data_reshaped = idnpary.reshape(1,-1)

prediction = model.predict(input_data_reshaped)

if (prediction[0] == 0):
  print("The Breast Cancer is Benign")
else:
  print("The Breast Cancer is Malignant")

The Breast Cancer is Malignant


