Importing the Dependencies

In [1]:
import os

import numpy as np
import pandas as pd
from pandas import DataFrame
from sklearn import svm
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

Data Collection & Processing

In [2]:
# Load the breast cancer dataset from a local CSV file (not from sklearn).
# The location can be overridden with the CANCER_DATA_PATH environment variable
# so the notebook can run on machines other than the original author's; when
# the variable is unset, the original absolute path is used unchanged.
DEFAULT_CANCER_CSV = 'C:\\Users\\koush\\OneDrive\\Documents\\AI-based-Healthcare-Chatbot-and-Disease-Detection-System\\cancer.csv'
breast_cancer_dataset = pd.read_csv(os.environ.get('CANCER_DATA_PATH', DEFAULT_CANCER_CSV))

In [3]:
# print the dataframe (pandas abbreviates the middle rows, as the output shows)
print(breast_cancer_dataset)

    diagnosis  radius_mean  texture_mean  perimeter_mean  area_mean  \
0           M        17.99         10.38          122.80     1001.0   
1           M        20.57         17.77          132.90     1326.0   
2           M        19.69         21.25          130.00     1203.0   
3           M        11.42         20.38           77.58      386.1   
4           M        20.29         14.34          135.10     1297.0   
..        ...          ...           ...             ...        ...   
564         M        21.56         22.39          142.00     1479.0   
565         M        20.13         28.25          131.20     1261.0   
566         M        16.60         28.08          108.30      858.1   
567         M        20.60         29.33          140.10     1265.0   
568         B         7.76         24.54           47.92      181.0   

     smoothness_mean  compactness_mean  concavity_mean  concavepoints_mean  \
0            0.11840           0.27760         0.30010             0.

In [4]:
# preview the top of the dataframe (first 5 rows)
breast_cancer_dataset.head(5)

Unnamed: 0,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concavepoints_mean,symmetry_mean,fractaldimension_mean
0,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871
1,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667
2,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999
3,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744
4,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883


In [5]:
# copy the 'diagnosis' values ('M' / 'B') into a new 'label' column
breast_cancer_dataset['label'] = breast_cancer_dataset['diagnosis']

In [6]:
# preview the bottom of the dataframe (last 5 rows)
breast_cancer_dataset.tail(5)

Unnamed: 0,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concavepoints_mean,symmetry_mean,fractaldimension_mean,label
564,M,21.56,22.39,142.0,1479.0,0.111,0.1159,0.2439,0.1389,0.1726,0.05623,M
565,M,20.13,28.25,131.2,1261.0,0.0978,0.1034,0.144,0.09791,0.1752,0.05533,M
566,M,16.6,28.08,108.3,858.1,0.08455,0.1023,0.09251,0.05302,0.159,0.05648,M
567,M,20.6,29.33,140.1,1265.0,0.1178,0.277,0.3514,0.152,0.2397,0.07016,M
568,B,7.76,24.54,47.92,181.0,0.05263,0.04362,0.0,0.0,0.1587,0.05884,B


In [7]:
# (rows, columns) of the dataframe, including the added 'label' column
breast_cancer_dataset.shape

(569, 12)

In [8]:
# column names, non-null counts and dtypes of the dataframe
breast_cancer_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 12 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   diagnosis              569 non-null    object 
 1   radius_mean            569 non-null    float64
 2   texture_mean           569 non-null    float64
 3   perimeter_mean         569 non-null    float64
 4   area_mean              569 non-null    float64
 5   smoothness_mean        569 non-null    float64
 6   compactness_mean       569 non-null    float64
 7   concavity_mean         569 non-null    float64
 8   concavepoints_mean     569 non-null    float64
 9   symmetry_mean          569 non-null    float64
 10  fractaldimension_mean  569 non-null    float64
 11  label                  569 non-null    object 
dtypes: float64(10), object(2)
memory usage: 53.5+ KB


In [9]:
# count the missing entries in each column
breast_cancer_dataset.isna().sum()

diagnosis                0
radius_mean              0
texture_mean             0
perimeter_mean           0
area_mean                0
smoothness_mean          0
compactness_mean         0
concavity_mean           0
concavepoints_mean       0
symmetry_mean            0
fractaldimension_mean    0
label                    0
dtype: int64

In [10]:
# summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
breast_cancer_dataset.describe()

Unnamed: 0,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concavepoints_mean,symmetry_mean,fractaldimension_mean
count,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0
mean,14.127292,19.289649,91.969033,654.889104,0.09636,0.104341,0.088799,0.048919,0.181162,0.062798
std,3.524049,4.301036,24.298981,351.914129,0.014064,0.052813,0.07972,0.038803,0.027414,0.00706
min,6.981,9.71,43.79,143.5,0.05263,0.01938,0.0,0.0,0.106,0.04996
25%,11.7,16.17,75.17,420.3,0.08637,0.06492,0.02956,0.02031,0.1619,0.0577
50%,13.37,18.84,86.24,551.1,0.09587,0.09263,0.06154,0.0335,0.1792,0.06154
75%,15.78,21.8,104.1,782.7,0.1053,0.1304,0.1307,0.074,0.1957,0.06612
max,28.11,39.28,188.5,2501.0,0.1634,0.3454,0.4268,0.2012,0.304,0.09744


In [11]:
# class balance of the target variable: number of rows per diagnosis
breast_cancer_dataset.diagnosis.value_counts()

diagnosis
B    357
M    212
Name: count, dtype: int64

B --> Benign

M --> Malignant

In [12]:
# Per-class feature averages: keep only the numeric columns, then average
# them within each 'diagnosis' group.
numeric_columns = breast_cancer_dataset.select_dtypes(include='number')
grouped_mean = (
    numeric_columns
    .groupby(by=breast_cancer_dataset['diagnosis'])
    .mean()
)

print(grouped_mean)


           radius_mean  texture_mean  perimeter_mean   area_mean  \
diagnosis                                                          
B            12.146524     17.914762       78.075406  462.790196   
M            17.462830     21.604906      115.365377  978.376415   

           smoothness_mean  compactness_mean  concavity_mean  \
diagnosis                                                      
B                 0.092478          0.080085        0.046058   
M                 0.102898          0.145188        0.160775   

           concavepoints_mean  symmetry_mean  fractaldimension_mean  
diagnosis                                                            
B                    0.025717       0.174186               0.062867  
M                    0.087990       0.192909               0.062680  


Separating the features and target

In [13]:
# features: every column except the two copies of the class label
X = breast_cancer_dataset.drop(columns=['diagnosis', 'label'])
# target: the 'label' column ('M' / 'B' strings)
Y = breast_cancer_dataset['label']

In [14]:
# inspect the feature matrix
print(X)

     radius_mean  texture_mean  perimeter_mean  area_mean  smoothness_mean  \
0          17.99         10.38          122.80     1001.0          0.11840   
1          20.57         17.77          132.90     1326.0          0.08474   
2          19.69         21.25          130.00     1203.0          0.10960   
3          11.42         20.38           77.58      386.1          0.14250   
4          20.29         14.34          135.10     1297.0          0.10030   
..           ...           ...             ...        ...              ...   
564        21.56         22.39          142.00     1479.0          0.11100   
565        20.13         28.25          131.20     1261.0          0.09780   
566        16.60         28.08          108.30      858.1          0.08455   
567        20.60         29.33          140.10     1265.0          0.11780   
568         7.76         24.54           47.92      181.0          0.05263   

     compactness_mean  concavity_mean  concavepoints_mean  symm

In [15]:
# inspect the target labels
print(Y)

0      M
1      M
2      M
3      M
4      M
      ..
564    M
565    M
566    M
567    M
568    B
Name: label, Length: 569, dtype: object


Splitting the data into training data & Testing data

In [16]:
# hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=5,
)

In [17]:
# shapes of the full feature matrix, the training part and the test part
print(*(part.shape for part in (X, X_train, X_test)))

(569, 10) (455, 10) (114, 10)


Model Training

Support Vector Machine (linear kernel)

In [18]:
# support vector classifier (scikit-learn svm.SVC) with a linear kernel
classifier = svm.SVC(kernel='linear')

In [19]:
# training the linear SVM classifier using the training data
classifier.fit(X_train, Y_train)

Model Evaluation

Accuracy Score

In [20]:
# accuracy on training data
X_train_prediction = classifier.predict(X_train)
training_data_accuracy = accuracy_score(Y_train, X_train_prediction)

In [21]:
# report the fraction of training rows that were classified correctly
print('Accuracy on training data = ', training_data_accuracy)

Accuracy on training data =  0.9098901098901099


In [22]:
# accuracy on test data
X_test_prediction = classifier.predict(X_test)
test_data_accuracy = accuracy_score(Y_test, X_test_prediction)

In [23]:
# report the fraction of held-out test rows that were classified correctly
print('Accuracy on test data = ', test_data_accuracy)

Accuracy on test data =  0.9473684210526315


Building a Predictive System

In [24]:
# One sample to classify. The values must be given in the same order as the
# training feature columns (X.columns): radius_mean, texture_mean, ...,
# fractaldimension_mean.
input_data = (13.54,14.36,87.46,566.3,0.09779,0.08129,0.06664,0.04781,0.1885,0.05766)

# change the input data to a numpy array
input_data_as_numpy_array = np.asarray(input_data)

# reshape the numpy array to (1, n_features) as we are predicting for one datapoint
input_data_reshaped = input_data_as_numpy_array.reshape(1,-1)

# The classifier was fitted on a DataFrame with named columns, so give the
# sample the same column names; predicting from a bare array makes
# scikit-learn warn about missing feature names.
input_data_frame = pd.DataFrame(input_data_reshaped, columns=X.columns)

prediction = classifier.predict(input_data_frame)
print(prediction)

# The model was trained on the string labels 'M' (malignant) and 'B' (benign),
# so the prediction is one of those strings. Comparing it with the integer 0
# is never true and would report every malignant prediction as benign.
if prediction[0] == 'M':
  print('The Breast cancer is Malignant')

else:
  print('The Breast Cancer is Benign')



['B']
The Breast Cancer is Benign




Saving the trained model

In [25]:
import pickle

In [26]:
# Serialize the trained classifier to disk. The context manager closes the
# file handle (flushing the data) even if pickling raises.
filename = 'cancer_model.sav'
with open(filename, 'wb') as model_file:
  pickle.dump(classifier, model_file)

In [27]:
# loading the saved model; the context manager closes the file handle afterwards
# NOTE: pickle.load can execute arbitrary code, so only load model files that
# come from a trusted source (such as the file written by this notebook).
with open('cancer_model.sav', 'rb') as model_file:
  loaded_model = pickle.load(model_file)

In [28]:
# print the feature names, one per line, in the column order of X
# (the order in which values must be supplied when predicting)
for column in X.columns:
  print(column)

radius_mean
texture_mean
perimeter_mean
area_mean
smoothness_mean
compactness_mean
concavity_mean
concavepoints_mean
symmetry_mean
fractaldimension_mean
