### Importing the dataset

In [1]:
import pandas as pd

Data_set = pd.read_csv("Breast_Cancer.csv", delimiter=",")
print(Data_set)

     Age   Race Marital Status N Stage 6th Stage  Tumor Size  Survival Months  \
0     68  White        Married      N1       IIA           4               60   
1     50  White        Married      N2      IIIA          35               62   
2     58  White       Divorced      N3      IIIC          63               75   
3     58  White        Married      N1       IIA          18               84   
4     47  White        Married      N1       IIB          41               50   
..   ...    ...            ...     ...       ...         ...              ...   
245   64  Black        Single       N1       IIA           9              103   
246   66  White        Single       N1       IIA           8               96   
247   45  White        Married      N1       IIB          28               40   
248   37  White       Divorced      N1       IIA          15               78   
249   50  Other        Married      N1       IIA          17               92   

                 differenti

<hr />

### Getting the desired features (i.e., columns) from the dataset

In [2]:
Features_names = Data_set.columns[0:7]  # all except the 'differentiate' column
print(Features_names)

Index(['Age', 'Race', 'Marital Status', 'N Stage', '6th Stage', 'Tumor Size',
       'Survival Months'],
      dtype='object')


<hr />

### Retrieving and formatting the target/goal (i.e., differentiate feature)

In [3]:
target = Data_set['differentiate'].tolist()
target = list(set(target))  # we used a set to retrieve the unique names only
print(target)

['Poorly differentiated', 'Moderately differentiated', 'Well differentiated']


<hr />

### Getting the values of the retrieved columns

In [4]:
X = Data_set[Features_names].values
# Or Data_set[['Age', 'Race', 'Marital Status', 'N Stage', '6th Stage', 'Tumor Size', 'Survival Months']].values

print(X)

[[68 'White' 'Married' ... 'IIA' 4 60]
 [50 'White' 'Married' ... 'IIIA' 35 62]
 [58 'White' 'Divorced' ... 'IIIC' 63 75]
 ...
 [45 'White' 'Married' ... 'IIB' 28 40]
 [37 'White' 'Divorced' ... 'IIA' 15 78]
 [50 'Other' 'Married' ... 'IIA' 17 92]]


<hr />

### Data Preprocessing Step (Categorical data to numeric data, for distance functions)
##### LabelEncoder() is used to encode categorical data as numeric data

In [5]:
from sklearn import preprocessing

# Race Feature
label_Race_variations = Data_set['Race'].tolist()
label_Race_variations = list(set(label_Race_variations))
print(label_Race_variations)

label_Race = preprocessing.LabelEncoder()
label_Race.fit(label_Race_variations)  # ['White', 'Black', 'Other']
X[:, 1] = label_Race.transform(X[:, 1])


# Marital Status Feature
label_Marital_Status_variations = Data_set['Marital Status'].tolist()
label_Marital_Status_variations = list(set(label_Marital_Status_variations))
print(label_Marital_Status_variations)

label_Marital_Status = preprocessing.LabelEncoder()
label_Marital_Status.fit(label_Marital_Status_variations)  # ['Married', 'Divorced', 'Single ', 'Widowed', 'Separated']
X[:, 2] = label_Marital_Status.transform(X[:, 2])


# N Stage Feature
label_N_Stage_variations = Data_set['N Stage'].tolist()
label_N_Stage_variations = list(set(label_N_Stage_variations))
print(label_N_Stage_variations)

label_N_Stage = preprocessing.LabelEncoder()
label_N_Stage.fit(label_N_Stage_variations)  # ['N3', 'N2', 'N1']
X[:, 3] = label_N_Stage.transform(X[:, 3])


# 6th Stage Feature
label_6th_Stage_variations = Data_set['6th Stage'].tolist()
label_6th_Stage_variations = list(set(label_6th_Stage_variations))
print(label_6th_Stage_variations)

label_6th_Stage = preprocessing.LabelEncoder()
label_6th_Stage.fit(label_6th_Stage_variations)  # ['IIIC', 'IIIA', 'IIIB', 'IIA', 'IIB']
X[:, 4] = label_6th_Stage.transform(X[:, 4])


# print(X)

['White', 'Other', 'Black']
['Separated', 'Divorced', 'Widowed', 'Married', 'Single ']
['N3', 'N2', 'N1']
['IIIB', 'IIIA', 'IIB', 'IIIC', 'IIA']


<hr />

### Splitting the dataset into training and testing sets

In [6]:
from sklearn.model_selection import train_test_split

Y = Data_set["differentiate"]  # Terget/Goal
print(Y)

# Dimensions of the dataset (i.e., rows and cols)
print(Data_set.shape)

# Split the data into (250 x 0.2 = 50 rows for testing) and (200 - 50 = 200 rows for training)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=3)  # 3 samples per iteration

print(X_train.shape)
print(X_test.shape)

print(Y_train.shape)
print(Y_test.shape)

0          Poorly differentiated
1      Moderately differentiated
2      Moderately differentiated
3          Poorly differentiated
4          Poorly differentiated
                 ...            
245        Poorly differentiated
246          Well differentiated
247        Poorly differentiated
248    Moderately differentiated
249    Moderately differentiated
Name: differentiate, Length: 250, dtype: object
(250, 8)
(200, 7)
(50, 7)
(200,)
(50,)


<hr />

### KNN (K-Nearest Neighbors) Classifier

In [7]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics

# Apply the KNN classifier with n=3
neigh = KNeighborsClassifier(n_neighbors=3)  # best value for n is sqrt(n)

# Train the Model using Training Sets (i.e. Classified data)
neigh.fit(X_train, Y_train)


# Do prediction on the testing set
predicted = neigh.predict(X_test)
print(predicted.shape)
print("\nPredicted by KNN:\n", predicted)


# Compare the predicted results with the predefined data (i.e., Accuracy)
results = metrics.confusion_matrix(Y_test, predicted)
print("\nKNN confusion matrix:\n", results)

print("\nKNN Accuracy: ", metrics.accuracy_score(Y_test, predicted))

(50,)

Predicted by KNN:
 ['Poorly differentiated' 'Moderately differentiated'
 'Moderately differentiated' 'Poorly differentiated'
 'Poorly differentiated' 'Moderately differentiated'
 'Moderately differentiated' 'Moderately differentiated'
 'Moderately differentiated' 'Moderately differentiated'
 'Moderately differentiated' 'Moderately differentiated'
 'Moderately differentiated' 'Poorly differentiated'
 'Moderately differentiated' 'Moderately differentiated'
 'Poorly differentiated' 'Moderately differentiated'
 'Moderately differentiated' 'Moderately differentiated'
 'Moderately differentiated' 'Moderately differentiated'
 'Moderately differentiated' 'Moderately differentiated'
 'Moderately differentiated' 'Poorly differentiated'
 'Moderately differentiated' 'Poorly differentiated'
 'Moderately differentiated' 'Moderately differentiated'
 'Moderately differentiated' 'Moderately differentiated'
 'Moderately differentiated' 'Poorly differentiated'
 'Moderately differentiated' 'Moderat

<hr />

### Naive Bayes Classifier

In [8]:
from sklearn.naive_bayes import GaussianNB
from sklearn import metrics

# Create a GaussianNB Classifier
model = GaussianNB()

# Train the Model using Training Sets (i.e. Classified data)
model.fit(X_train, Y_train)


# Do prediction on the testing set
predicted = model.predict(X_test)
print(predicted.shape)
print("\nPredicted by Naive Bayes:\n", predicted)


# Compare the predicted results with the predefined data (i.e., Accuracy)
results = metrics.confusion_matrix(Y_test, predicted)
print("\nNaive Bayes confusion matrix:\n", results)

print("\nNaive Bayes Accuracy: ", metrics.accuracy_score(Y_test, predicted))

(50,)

Predicted by Naive Bayes:
 ['Moderately differentiated' 'Moderately differentiated'
 'Well differentiated' 'Poorly differentiated' 'Poorly differentiated'
 'Moderately differentiated' 'Moderately differentiated'
 'Poorly differentiated' 'Poorly differentiated' 'Well differentiated'
 'Moderately differentiated' 'Moderately differentiated'
 'Moderately differentiated' 'Poorly differentiated'
 'Moderately differentiated' 'Well differentiated'
 'Moderately differentiated' 'Moderately differentiated'
 'Moderately differentiated' 'Well differentiated' 'Well differentiated'
 'Well differentiated' 'Poorly differentiated' 'Moderately differentiated'
 'Moderately differentiated' 'Moderately differentiated'
 'Moderately differentiated' 'Moderately differentiated'
 'Well differentiated' 'Moderately differentiated' 'Poorly differentiated'
 'Moderately differentiated' 'Moderately differentiated'
 'Moderately differentiated' 'Moderately differentiated'
 'Moderately differentiated' 'Moderately 