In [26]:
import pandas as pd
data_frame = pd.read_csv('breast-cancer-wisconsin.data')
data_frame = data_frame.apply(pd.to_numeric, errors='coerce')
data_frame = data_frame.fillna(0).astype(int)


In [None]:
normalized_data = data_frame.copy()
for data in normalized_data:
    max_value = data_frame[data].max()
    min_value = data_frame[data].min() 
    normalized_data[data] = (normalized_data[data]-min_value)/(max_value-min_value)
print(normalized_data)

In [33]:
# Performed K-Nearest Neighbour

from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score


X = normalized_data.iloc[:, :-1]  
y = normalized_data.iloc[:, -1]  

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

k = 5 

knn_classifier = KNeighborsClassifier(n_neighbors=k)

knn_classifier.fit(X_train, y_train)

y_pred = knn_classifier.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy on Test Data:", accuracy)

# actual_vs_predicted = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
# print("\nActual vs. Predicted Values:")
# print(actual_vs_predicted)


Accuracy on Test Data: 0.9761904761904762


In [32]:
# Perform Univariate Trees

from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

X = normalized_data.iloc[:, :-1]  
y = normalized_data.iloc[:, -1]  

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

clf = DecisionTreeClassifier()
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)


Accuracy: 0.9428571428571428


In [35]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report

X = normalized_data.iloc[:, :-1]  
y = normalized_data.iloc[:, -1]  

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

gnb = GaussianNB()
gnb.fit(X_train, y_train)
y_pred = gnb.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
# report = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
# print("Classification Report:\n", report)


Accuracy: 0.9714285714285714


In [41]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
import numpy as np


X = normalized_data.iloc[:, :-1].values  
y = normalized_data.iloc[:, -1].values   


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


knn_classifier = KNeighborsClassifier(n_neighbors=5)
clf = DecisionTreeClassifier()
gnb = GaussianNB()


knn_classifier.fit(X_train, y_train)
clf.fit(X_train, y_train)
gnb.fit(X_train, y_train)

# Create lists to store misclassified samples for each classifier
misclassified_samples_knn = []
misclassified_samples_tree = []
misclassified_samples_gnb = []

# Loop through test samples and classifiers
for idx, (test_sample, true_class) in enumerate(zip(X_test, y_test)):
    # K-Nearest Neighbors (KNN)
    test_sample = np.array(test_sample).reshape(1, -1)  # Reshape the input data
    predicted_class_knn = knn_classifier.predict(test_sample)[0]
    if predicted_class_knn != true_class:
        misclassified_samples_knn.append((idx, true_class, predicted_class_knn))

    # Decision Tree Classifier
    test_sample = np.array(test_sample).reshape(1, -1)  # Reshape the input data
    predicted_class_tree = clf.predict(test_sample)[0]
    if predicted_class_tree != true_class:
        misclassified_samples_tree.append((idx, true_class, predicted_class_tree))

    # Gaussian Naive Bayes Classifier
    test_sample = np.array(test_sample).reshape(1, -1)  # Reshape the input data
    predicted_class_gnb = gnb.predict(test_sample)[0]
    if predicted_class_gnb != true_class:
        misclassified_samples_gnb.append((idx, true_class, predicted_class_gnb))

# Find samples misclassified by all classifiers
misclassified_by_all = []
for sample_knn in misclassified_samples_knn:
    for sample_tree in misclassified_samples_tree:
        for sample_gnb in misclassified_samples_gnb:
            if (
                sample_knn[0] == sample_tree[0] == sample_gnb[0] and
                sample_knn[1] == sample_tree[1] == sample_gnb[1] and
                sample_knn[2] == sample_tree[2] == sample_gnb[2]
            ):
                misclassified_by_all.append(sample_knn)

# Prepare a report of misclassified samples by all classifiers
print("misclassified samples")
for idx, true_class, predicted_class in misclassified_by_all:
    print(f"Sample {idx}: True Class - {true_class}, Predicted Class - {predicted_class}")

from sklearn.metrics import confusion_matrix

# Calculate confusion matrix for K-Nearest Neighbors
cm_knn = confusion_matrix(y_test, knn_classifier.predict(X_test))
print("Confusion Matrix for K-Nearest Neighbors:")
print(cm_knn)

# Calculate confusion matrix for Decision Tree Classifier
cm_tree = confusion_matrix(y_test, clf.predict(X_test))
print("\nConfusion Matrix for Decision Tree Classifier:")
print(cm_tree)

# Calculate confusion matrix for Gaussian Naive Bayes Classifier
cm_gnb = confusion_matrix(y_test, gnb.predict(X_test))
print("\nConfusion Matrix for Gaussian Naive Bayes Classifier:")
print(cm_gnb)



misclassified samples
Sample 71: True Class - 0.0, Predicted Class - 1.0
Sample 109: True Class - 0.0, Predicted Class - 1.0
Sample 118: True Class - 0.0, Predicted Class - 1.0
Confusion Matrix for K-Nearest Neighbors:
[[86  3]
 [ 0 51]]

Confusion Matrix for Decision Tree Classifier:
[[86  3]
 [ 5 46]]

Confusion Matrix for Gaussian Naive Bayes Classifier:
[[85  4]
 [ 0 51]]
