In [10]:
#Various Import Statements
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from collections import Counter

Data Preprocessing for Question 1 and 2 

In [3]:
# Load the breast cancer dataset from sklearn
cancer_data = load_breast_cancer()

# Features (X) and target labels (y)
X = cancer_data.data  # Feature data (measurements)
y = cancer_data.target  # Target labels (malignant or benign)

# Normalize the feature data to have mean 0 and standard deviation 1
scaler = StandardScaler()
X_normalized = scaler.fit_transform(X)

# Split the dataset into training (70%) and testing (30%) sets
X_train, X_test, y_train, y_test = train_test_split(X_normalized, y, test_size=0.3)

KNN Implementation

In [4]:
# Function to calculate the Euclidean distance between two points
def euclidean_distance(point1, point2):
    return np.sqrt(np.sum((point1 - point2) ** 2))

# Function to get the k nearest neighbors for a given test point
def get_k_nearest_neighbors(X_train, y_train, test_point, k):
    # Calculate the distance from the test point to all training points
    distances = [euclidean_distance(test_point, x_train) for x_train in X_train]
    
    # Get the indices of the k nearest neighbors
    k_indices = np.argsort(distances)[:k]
    
    # Get the labels of the k nearest neighbors
    k_nearest_labels = [y_train[i] for i in k_indices]
    
    return k_nearest_labels

# Function to predict the class of a single test point using the nearest neighbors
def predict_single(X_train, y_train, test_point, k):
    # Get the labels of the k nearest neighbors
    k_nearest_labels = get_k_nearest_neighbors(X_train, y_train, test_point, k)
    
    # Perform majority voting to determine the predicted class
    most_common = Counter(k_nearest_labels).most_common(1)
    return most_common[0][0]

# Function to predict classes for a test set
def predict(X_train, y_train, X_test, k):
    # Predict the class for each test point in the test set
    predictions = [predict_single(X_train, y_train, test_point, k) for test_point in X_test]
    return np.array(predictions)

Parameter Tuning of K 

In [5]:
# Loop through different values of k for KNN to find the optimal k
k_values = range(1, 21)  # Check k values from 1 to 20
accuracies_knn = []  # List to store accuracy for each k value

# Evaluate KNN for each k
for k in k_values:
    y_pred_knn = predict(X_train, y_train, X_test, k)  # Make predictions for the current k
    accuracy_knn = accuracy_score(y_test, y_pred_knn)  # Calculate accuracy for KNN
    accuracies_knn.append(accuracy_knn)  # Store the accuracy

# Find the optimal k based on maximum accuracy
optimal_k = k_values[np.argmax(accuracies_knn)]  # Get k value with the highest accuracy
max_accuracy_knn = max(accuracies_knn)  # Max accuracy for KNN

# KNN Predictions with Optimal k
y_pred_knn_optimal = predict(X_train, y_train, X_test, optimal_k)  # Predict using optimal k

# Evaluate KNN with optimal k
accuracy_knn_optimal = accuracy_score(y_test, y_pred_knn_optimal)  # Calculate accuracy
confusion_knn_optimal = confusion_matrix(y_test, y_pred_knn_optimal)  # Confusion matrix
report_knn_optimal = classification_report(y_test, y_pred_knn_optimal)  # Detailed classification report

Logistic Regression Implementation 

In [6]:
# Logistic Regression Model
log_reg_model = LogisticRegression()  # Initialize the Logistic Regression model
log_reg_model.fit(X_train, y_train)  # Fit the model on the training data

# Make predictions on the test data
y_pred_log_reg = log_reg_model.predict(X_test)

# Evaluate the performance of the Logistic Regression model
accuracy_log_reg = accuracy_score(y_test, y_pred_log_reg)  # Calculate accuracy
confusion_log_reg = confusion_matrix(y_test, y_pred_log_reg)  # Confusion matrix
report_log_reg = classification_report(y_test, y_pred_log_reg)  # Detailed classification report

KNN and Logistic Regression Evaluations and Comparision of Performance

In [7]:
# Print results for KNN
print("\nKNN Performance:")
print(f"Optimal k: {optimal_k}")  # Print the optimal k value
print(f"Accuracy: {accuracy_knn_optimal:}")  # Print accuracy for KNN
print("Confusion Matrix:")
print(confusion_knn_optimal)  # Print confusion matrix for KNN
print("Classification Report:")
print(report_knn_optimal)  # Print classification report for KNN

# Print results for Logistic Regression
print("Logistic Regression Performance:")
print(f"Accuracy: {accuracy_log_reg:}")  # Print accuracy
print("Confusion Matrix:")
print(confusion_log_reg)  # Print confusion matrix
print("Classification Report:")
print(report_log_reg)  # Print classification report



KNN Performance:
Optimal k: 3
Accuracy: 0.9532163742690059
Confusion Matrix:
[[ 52   6]
 [  2 111]]
Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.90      0.93        58
           1       0.95      0.98      0.97       113

    accuracy                           0.95       171
   macro avg       0.96      0.94      0.95       171
weighted avg       0.95      0.95      0.95       171

Logistic Regression Performance:
Accuracy: 0.9707602339181286
Confusion Matrix:
[[ 54   4]
 [  1 112]]
Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.93      0.96        58
           1       0.97      0.99      0.98       113

    accuracy                           0.97       171
   macro avg       0.97      0.96      0.97       171
weighted avg       0.97      0.97      0.97       171

