## Breast Cancer Classification

## Importing the Libraries

In [1]:
import numpy as np
import pandas as pd

import warnings

warnings.filterwarnings('ignore')

## Loading the dataset

In [2]:
data = pd.read_csv('Data.csv')

In [3]:
# Total number of records
n_records = len(data)

# Number of records where Target Class is 0
class_benign = len(data[data['Class'] == 2])

# Number of records where Target Class is 1
class_malignant = len(data[data['Class'] == 4])

# Percentage of record where Target Class is 1
percent_malignant = round(class_malignant/n_records,5)*100



# Results

print(f'Total Number of records : {n_records}')
print(f'Number of records where Target Class(benign) is 2 : {class_benign}')
print(f'Number of records where Target Class(malignant) is 4 : {class_malignant}')
print(f'Percentage of record where Target Class is 4 (malignant) : {percent_malignant}')

Total Number of records : 683
Number of records where Target Class(benign) is 2 : 444
Number of records where Target Class(malignant) is 4 : 239
Percentage of record where Target Class is 4 (malignant) : 34.993


In [4]:
data.head()

Unnamed: 0,Sample code number,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2


In [5]:
# Matrix of feature
X = data.iloc[ :,1:-1].values

# Target Vector
y = data.iloc[ :,-1].values

In [6]:
print(X.shape,y.shape)

(683, 9) (683,)


## Splitting the dataset into training set and test set

In [7]:
from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 0.25)

In [8]:
print(X_train.shape, y_train.shape)

(512, 9) (512,)


In [9]:
print(X_test.shape,y_test.shape)

(171, 9) (171,)


## Feature Scaling

In [10]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

scaled_train = scaler.fit_transform(X_train)
scaled_test = scaler.transform(X_test)

## Creating the Model

In [11]:
from sklearn.tree import DecisionTreeClassifier

classifier = DecisionTreeClassifier(criterion = 'entropy')
classifier.fit(X_train,y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='entropy',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [17]:
print(f'Decision tree training Score {classifier.score(X_train,y_train)}')
print(f'Decision tree test Score {classifier.score(X_train,y_train)}')

Decision tree training Score 1.0


## Evaluating the model on test set

In [12]:
predictions = classifier.predict(X_test)

In [13]:
from sklearn.metrics import confusion_matrix,classification_report

cm = confusion_matrix(y_test,predictions)

In [14]:
cm

array([[115,   4],
       [  5,  47]], dtype=int64)

In [16]:
print(classification_report(y_test,predictions))

              precision    recall  f1-score   support

           2       0.96      0.97      0.96       119
           4       0.92      0.90      0.91        52

    accuracy                           0.95       171
   macro avg       0.94      0.94      0.94       171
weighted avg       0.95      0.95      0.95       171

