In [None]:
# Classification, Dataset: Prostate Cancer

In [7]:
import pandas as pd
import numpy as np
dataset=pd.read_csv("https://raw.githubusercontent.com/DrUzair/MLSD/master/Datasets/Prostate_Cancer.csv")

dataset = dataset.drop('id', 1) # drop id column

dataset.describe()

Unnamed: 0,radius,texture,perimeter,area,smoothness,compactness,symmetry,fractal_dimension
count,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0
mean,16.85,18.23,96.78,702.88,0.10273,0.1267,0.19317,0.06469
std,4.879094,5.192954,23.676089,319.710895,0.014642,0.061144,0.030785,0.008151
min,9.0,11.0,52.0,202.0,0.07,0.038,0.135,0.053
25%,12.0,14.0,82.5,476.75,0.0935,0.0805,0.172,0.059
50%,17.0,17.5,94.0,644.0,0.102,0.1185,0.19,0.063
75%,21.0,22.25,114.25,917.0,0.112,0.157,0.209,0.069
max,25.0,27.0,172.0,1878.0,0.143,0.345,0.304,0.097


In [17]:
# The data set contains patients who have been diagnosed with either Malignant (M) or Benign (B) cancer.
print(dataset['diagnosis_result'].value_counts())

M    62
B    38
Name: diagnosis_result, dtype: int64


In [None]:
#Data Normalization

In [18]:
#Normalizing numeric data
def normalize(x):
  return ((x - min(x)) / (max(x) - min(x))) 

In [21]:
X = list(set(list(dataset)) - set(['diagnosis_result']))
#dataset_x = dataset.drop('diagnosis_result', 1)
prc_n = dataset.iloc[:,].copy()
prc_n[X] = prc_n[X].apply(normalize)
prc_n

Unnamed: 0,diagnosis_result,radius,texture,perimeter,area,smoothness,compactness,symmetry,fractal_dimension
0,M,0.8750,0.0625,0.825000,0.448687,1.000000,0.781759,0.633136,0.590909
1,B,0.0000,0.1250,0.675000,0.670644,1.000000,0.133550,0.272189,0.090909
2,M,0.7500,1.0000,0.650000,0.597255,0.753425,0.397394,0.426036,0.159091
3,M,0.3125,0.3125,0.216667,0.109785,0.000000,0.801303,0.739645,1.000000
4,M,0.0000,0.5000,0.691667,0.653341,0.972603,0.309446,0.272189,0.136364
...,...,...,...,...,...,...,...,...,...
95,M,0.8750,0.3125,0.666667,0.633652,0.287671,0.302932,0.443787,0.068182
96,B,0.8125,0.1875,0.216667,0.148568,0.479452,0.107492,0.325444,0.295455
97,B,0.6250,1.0000,0.083333,0.055489,0.438356,0.048860,0.000000,0.363636
98,B,0.7500,0.8125,0.183333,0.125895,0.273973,0.120521,0.159763,0.295455


In [None]:
#Alternate way for normalizing the dataset

In [20]:
from sklearn.preprocessing import StandardScaler  
scaler = StandardScaler()  

dataset_x = dataset.drop('diagnosis_result', 1)
prc_n = dataset_x.iloc[:,].copy()

scaler.fit(prc_n[X])
prc_n = scaler.transform(prc_n[X])

In [None]:
#The first variable in our data set (after removal of id) is ‘diagnosis_result’ which is not numeric in nature. So, we start from 2nd variable. The function apply() applies normalize() to each feature in the data frame. Let’s check using the variable ‘radius’ whether the data has been normalized.

In [5]:
#Creating training and test data set:

AttributeError: 'numpy.ndarray' object has no attribute 'describe'

In [22]:
import numpy as np
prc_n.describe()

Unnamed: 0,radius,texture,perimeter,area,smoothness,compactness,symmetry,fractal_dimension
count,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0
mean,0.490625,0.451875,0.373167,0.298854,0.448356,0.288925,0.344201,0.265682
std,0.304943,0.32456,0.197301,0.190758,0.200572,0.199165,0.18216,0.185249
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.1875,0.1875,0.254167,0.163932,0.321918,0.138436,0.218935,0.136364
50%,0.5,0.40625,0.35,0.263723,0.438356,0.262215,0.325444,0.227273
75%,0.75,0.703125,0.51875,0.426611,0.575342,0.387622,0.43787,0.363636
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [23]:
import numpy as np
prc_train = prc_n.iloc[1:65, :]
prc_test = prc_n.iloc[66:100, :]
prc_train['diagnosis_result']

1     B
2     M
3     M
4     M
5     B
     ..
60    B
61    B
62    M
63    B
64    M
Name: diagnosis_result, Length: 64, dtype: object

In [None]:
#Our target variable is ‘diagnosis_result’ which we have not included in our training and test data sets.#

In [None]:
#kNN Model

In [None]:
#The knn needs to be used to train a model for which we need to import kNeighborsClassifier from sklearn.neighbors. The kNeighborsClassifier() function identifies the k-nearest neighbors using Euclidean distance where k is a user-specified number. Now we are ready to use the function to classify test data

In [24]:
from sklearn.neighbors import KNeighborsClassifier  
classifier = KNeighborsClassifier(n_neighbors=5) 
y = prc_train['diagnosis_result']
classifier.fit(prc_train[X], prc_train['diagnosis_result']) 


KNeighborsClassifier()

In [25]:
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

KNeighborsClassifier(n_jobs=1)

In [None]:
#Prediction Results

In [26]:
y_pred = classifier.predict(prc_test[X])  
y_pred

array(['M', 'B', 'M', 'B', 'M', 'M', 'M', 'B', 'B', 'M', 'M', 'M', 'M',
       'B', 'M', 'M', 'M', 'M', 'B', 'M', 'M', 'M', 'M', 'M', 'B', 'M',
       'M', 'M', 'M', 'M', 'B', 'B', 'B', 'M'], dtype=object)

In [27]:
from sklearn.metrics import classification_report, confusion_matrix  
print(confusion_matrix(prc_test['diagnosis_result'], y_pred))  
print(classification_report(prc_test['diagnosis_result'], y_pred))  

[[ 9 10]
 [ 1 14]]
              precision    recall  f1-score   support

           B       0.90      0.47      0.62        19
           M       0.58      0.93      0.72        15

    accuracy                           0.68        34
   macro avg       0.74      0.70      0.67        34
weighted avg       0.76      0.68      0.66        34

