In [1]:
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from collections import Counter

Preprocessing the dataset

In [5]:
cancer_data = load_breast_cancer()

X = cancer_data.data  # Feature data (measurements)
y = cancer_data.target  # Target labels (malignant or benign)

# Normalize: mean 0 and standard deviation 1
X_normalized = StandardScaler().fit_transform(X)

# Split the dataset into training (80%) and testing (20%) sets
X_train, X_test, y_train, y_test = train_test_split(X_normalized, y, test_size=0.2)

KNN Implementation

In [6]:
def euclidean_distance(point1, point2):
    return np.sqrt(np.sum((point1 - point2) ** 2))

def get_k_nearest_neighbors(X_train, y_train, p, k):

    #Distance from the test point to all training points
    distances = [euclidean_distance(p, x_train) for x_train in X_train]
    
    k_indices = np.argsort(distances)[:k]
    
    # Get the labels of the k nearest neighbors
    k_nearest_labels = [y_train[i] for i in k_indices]
    
    return k_nearest_labels

# Function to predict the class of a single test point using the nearest neighbors
def predict_point(X_train, y_train, p, k):
    # Get the labels of the k nearest neighbors
    k_nearest_labels = get_k_nearest_neighbors(X_train, y_train, p, k)
    
    # Perform majority voting to determine the predicted class
    most_common = Counter(k_nearest_labels).most_common(1)
    return most_common[0][0]

# Function to predict classes for a test set
def predict(X_train, y_train, X_test, k):
    # Predict the class for each test point in the test set
    predictions = [predict_point(X_train, y_train, test_point, k) for test_point in X_test]
    return np.array(predictions)

Parameter Tuning of K 

In [7]:
# Loop through different values of k for KNN to find the optimal k
k_values = range(1, 21) 
accuracies_knn = [] 

# Evaluate KNN for each k
for k in k_values:
    y_pred_knn = predict(X_train, y_train, X_test, k)  # Make predictions for the current k
    accuracy_knn = accuracy_score(y_test, y_pred_knn)  # Calculate accuracy for KNN
    accuracies_knn.append(accuracy_knn)  # Store the accuracy

# Find the optimal k based on maximum accuracy
optimal_k = k_values[np.argmax(accuracies_knn)]  # Get k value with the highest accuracy
max_accuracy_knn = max(accuracies_knn)  # Max accuracy for KNN

# KNN Predictions with Optimal k
y_pred_knn_optimal = predict(X_train, y_train, X_test, optimal_k)  # Predict using optimal k

# Evaluate KNN with optimal k
accuracy_knn_optimal = accuracy_score(y_test, y_pred_knn_optimal)  # Calculate accuracy
confusion_knn_optimal = confusion_matrix(y_test, y_pred_knn_optimal)  # Confusion matrix
report_knn_optimal = classification_report(y_test, y_pred_knn_optimal)  # Detailed classification report

Logistic Regression Implementation 

In [8]:
# Logistic Regression Model
log_reg_model = LogisticRegression()  # Initialize the Logistic Regression model
log_reg_model.fit(X_train, y_train)  # Fit the model on the training data

# Make predictions on the test data
y_pred_log_reg = log_reg_model.predict(X_test)

# Evaluate the performance of the Logistic Regression model
accuracy_log_reg = accuracy_score(y_test, y_pred_log_reg)  # Calculate accuracy
confusion_log_reg = confusion_matrix(y_test, y_pred_log_reg)  # Confusion matrix
report_log_reg = classification_report(y_test, y_pred_log_reg)  # Detailed classification report

KNN and Logistic Regression Evaluations and Comparision of Performance

In [9]:
# Print results for KNN
print("\nKNN Performance:")
print(f"Optimal k: {optimal_k}")  # Print the optimal k value
print(f"Accuracy: {accuracy_knn_optimal:}")  # Print accuracy for KNN
print("Confusion Matrix:")
print(confusion_knn_optimal)  # Print confusion matrix for KNN
print("Classification Report:")
print(report_knn_optimal)  # Print classification report for KNN

# Print results for Logistic Regression
print("Logistic Regression Performance:")
print(f"Accuracy: {accuracy_log_reg:}")  # Print accuracy
print("Confusion Matrix:")
print(confusion_log_reg)  # Print confusion matrix
print("Classification Report:")
print(report_log_reg)  # Print classification report



KNN Performance:
Optimal k: 4
Accuracy: 0.9649122807017544
Confusion Matrix:
[[40  3]
 [ 1 70]]
Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.93      0.95        43
           1       0.96      0.99      0.97        71

    accuracy                           0.96       114
   macro avg       0.97      0.96      0.96       114
weighted avg       0.97      0.96      0.96       114

Logistic Regression Performance:
Accuracy: 0.9736842105263158
Confusion Matrix:
[[40  3]
 [ 0 71]]
Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.93      0.96        43
           1       0.96      1.00      0.98        71

    accuracy                           0.97       114
   macro avg       0.98      0.97      0.97       114
weighted avg       0.97      0.97      0.97       114

