# **Pre-Processing for Hayes-roth dataset**

In [243]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# Load the dataset
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/hayes-roth/hayes-roth.data"
column_names = ["name", "hobby", "age", "educational_level", "marital_status", "class"]
data = pd.read_csv(url, names=column_names)

# Check for missing values
missing_values = data.isnull().sum()
print("Missing values:\n", missing_values)

# Convert categorical features to numerical using label encoding
label_encoder = LabelEncoder()
data['hobby'] = label_encoder.fit_transform(data['hobby'])
data['educational_level'] = label_encoder.fit_transform(data['educational_level'])
data['marital_status'] = label_encoder.fit_transform(data['marital_status'])

# Drop the 'name' column as it is not useful for modeling
data.drop('name', axis=1, inplace=True)

# Split the data into features and target
X = data.drop('class', axis=1)
y = data['class']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Print the shape of the resulting sets
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)


Missing values:
 name                 0
hobby                0
age                  0
educational_level    0
marital_status       0
class                0
dtype: int64
X_train shape: (105, 4)
X_test shape: (27, 4)
y_train shape: (105,)
y_test shape: (27,)


# **Pre-Processing for Car-evaluation dataset**

In [241]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# Load the dataset
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/car/car.data"
column_names = ["buying", "maint", "doors", "persons", "lug_boot", "safety", "class"]
data = pd.read_csv(url, names=column_names)

# Check for missing values
missing_values = data.isnull().sum()
print("Missing values:\n", missing_values)

# Encode categorical features to numerical values
label_encoder = LabelEncoder()
data_encoded = data.apply(label_encoder.fit_transform)

# Split the data into features and target
X = data_encoded.drop('class', axis=1)
y = data_encoded['class']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Print the shape of the resulting sets
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)


Missing values:
 buying      0
maint       0
doors       0
persons     0
lug_boot    0
safety      0
class       0
dtype: int64
X_train shape: (1382, 6)
X_test shape: (346, 6)
y_train shape: (1382,)
y_test shape: (346,)


# **Pre-Processing for Breast_cancer dataset**

In [242]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# Load the dataset
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.data"
column_names = ["id", "diagnosis", "mean_radius", "mean_texture", "mean_smoothness", "mean_area", "class"]
data = pd.read_csv(url, names=column_names)

# Drop unnecessary columns (e.g., 'id' column)
data.drop('id', axis=1, inplace=True)

# Check for missing values
missing_values = data.isnull().sum()
print("Missing values:\n", missing_values)

# Encode the diagnosis (Malignant: M, Benign: B) to numerical values
label_encoder = LabelEncoder()
data['diagnosis'] = label_encoder.fit_transform(data['diagnosis'])

# Split the data into features and target
X = data.drop('diagnosis', axis=1)
y = data['diagnosis']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Print the shape of the resulting sets
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)


Missing values:
 diagnosis          0
mean_radius        0
mean_texture       0
mean_smoothness    0
mean_area          0
class              0
dtype: int64
X_train shape: (455, 5)
X_test shape: (114, 5)
y_train shape: (455,)
y_test shape: (114,)


 # **Implementation of KNN algorithm from scratch with functions to evaluate it with a k-fold cross validation, k=10 for Hayes-roth dataset**


In [232]:
# Import necessary functions/classes from the 'csv' module
from csv import reader   # 'reader' function for reading CSV files

# Import necessary function from the 'random' module
from random import randrange   # 'randrange' function for generating random integers within a specified range

# Function to load CSV data into a list of lists
def load_csv(file):
    data = []  # Initialize an empty list to store the data
    with open(file, 'r') as file:  # Open the CSV file in read mode
        csv_file = reader(file)  # Create a CSV reader object
        for row in csv_file:  # Iterate through each row in the CSV file
            if not row:  # Skip empty rows
                continue
            data.append(row)  # Append the non-empty row to the data list
    return data  # Return the loaded data as a list of lists


# Function to calculate Minkowski distance
def distance_calculation(tuple1, tuple2, n=2):
    """
    Calculate the Minkowski distance between two tuples.

    Parameters:
    tuple1 (list): First tuple of data points.
    tuple2 (list): Second tuple of data points.
    n (int): Parameter for the Minkowski distance calculation (default is 2 for Euclidean distance).

    Returns:
    float: Minkowski distance between the tuples.
    """
    distance = 0
    for i in range(len(tuple1) - 1):
        distance += (tuple1[i] - tuple2[i]) ** n
    return distance ** (1 / n)

# Function to make a classification prediction using k neighbors
def class_prediction(train_data, test_tuple, k):
    """
    Make a classification prediction using k-nearest neighbors.

    Parameters:
    train_data (list): Training data with labeled tuples.
    test_tuple (list): Tuple for which prediction is to be made.
    k (int): Number of nearest neighbors to consider.

    Returns:
    int: Predicted class label.
    """
    neighbors = get_k_neighbors(train_data, test_tuple, k)
    classes = [neighbor[-1] for neighbor in neighbors]
    predicted_class = max(set(classes), key=classes.count)
    return predicted_class

# Function to split dataset into k folds
def split_data_into_k_folds(data, k=10):
    """
    Split the dataset into k folds for k-fold cross-validation.

    Parameters:
    data (list): The dataset to be split into folds.
    k (int): The number of folds (default is 10).

    Returns:
    list: A list of k folds, where each fold is a subset of the dataset.
    """
    split_data = []
    data_copy = list(data)
    fold_size = int(len(data) / k)
    for _ in range(k):
        fold = []
        while len(fold) < fold_size:
            index = randrange(len(data_copy))
            fold.append(data_copy.pop(index))
        split_data.append(fold)
    return split_data

# Function to find k nearest neighbors
def get_k_neighbors(train_data, test_tuple, k):
    """
    Find the k nearest neighbors for a given test tuple.

    Parameters:
    train_data (list): The training data with labeled tuples.
    test_tuple (list): The test tuple for which neighbors are to be found.
    k (int): The number of nearest neighbors to consider.

    Returns:
    list: The k nearest neighbors to the test tuple.
    """
    distances = []
    for i in train_data:
        distance = distance_calculation(test_tuple, i)
        distances.append([i, distance])
    distances.sort(key=lambda t: t[1])
    neighbors = [t[0] for t in distances[:k]]
    return neighbors


# Function to calculate accuracy
def accuracy_calculation(actual_class, predicted_class):
    """
    Calculate the accuracy of predictions.

    Parameters:
    actual_class (list): True class labels.
    predicted_class (list): Predicted class labels.

    Returns:
    float: Accuracy of the predictions (between 0 and 1).
    """
    correct_prediction = sum(1 for a, p in zip(actual_class, predicted_class) if a == p)
    return correct_prediction / len(actual_class)

# Function to perform Min-Max scaling
def MinMaxScaler(data):
    """
    Perform Min-Max scaling on the dataset.

    Parameters:
    data (list): The dataset to be scaled.

    Returns:
    None: The function modifies the dataset in-place.
    """
    minmax = []
    for i in range(len(data[0]) - 1):
        column = [tuple[i] for tuple in data]
        minimum = min(column)
        maximum = max(column)
        minmax.append([minimum, maximum])
    for tuple in data:
        for i in range(len(tuple) - 1):
            tuple[i] = (tuple[i] - minmax[i][0]) / (minmax[i][1] - minmax[i][0])


# Function to perform k-NN algorithm with k-fold cross-validation
def knn_algorithm(data, no_of_neighbors, k=10):
    """
    Perform k-NN algorithm with k-fold cross-validation.

    Parameters:
    data (list): The dataset for training and validation.
    no_of_neighbors (int): Number of neighbors to consider.
    k (int): Number of folds for cross-validation (default is 10).

    Returns:
    list: List of accuracy scores for each fold.
    """
    folds = split_data_into_k_folds(data)
    list_of_scores = []
    for fold in folds:
        train_set = sum([f for f in folds if f != fold], [])
        test_set = [tuple[:-1] for tuple in fold]  # Exclude the last element (class label)
        predicted_class = [class_prediction(train_set, tuple, no_of_neighbors) for tuple in test_set]
        actual_classes = [tuple[-1] for tuple in fold]
        accuracy = accuracy_calculation(actual_classes, predicted_class)
        list_of_scores.append(accuracy)
    return list_of_scores

# Load the dataset and preprocess it
file = 'hayes-roth.data'
data = load_csv(file)
dataframe = [[int(j) for j in i[1:]] for i in data]

k_folds = 10
no_of_neighbors = 3
MinMaxScaler(dataframe)

# Perform k-NN on training and validation set
knn_scratch_hayes = knn_algorithm(dataframe, no_of_neighbors)

# Print the results
print(f'Implementation of KNN algorithm from scratch with functions to evaluate it with a k-fold cross-validation for hayes-roth dataset, k=9:')
print(f'Scores with Train and Validation Set : {knn_scratch_hayes}')
print(f'Mean Accuracy (Train and Validation Set) : {(sum(knn_scratch_hayes) / len(knn_scratch_hayes)) * 100}')

# Perform k-NN on the testing set
test_data_df = [[int(j) for j in i] for i in data]  # Convert data to integers
MinMaxScaler(test_data_df)  # Scale the test data using Min-Max scaling
list_of_scores = []  # List to store accuracy scores
predicted_class = []  # List to store predicted classes

# Iterate through each tuple in the test data
for tuple in test_data_df:
    predicted_class.append(class_prediction(dataframe, tuple, no_of_neighbors))

actual_classes = [tuple[-1] for tuple in dataframe]  # Extract actual classes from the training data
accuracy = accuracy_calculation(actual_classes, predicted_class)  # Calculate accuracy
list_of_scores.append(accuracy)  # Append accuracy to the list of scores

# Print the results for the test data
print(f'Scores on Test Data : {list_of_scores}')
print(f'Mean Accuracy (Test Data) of scores for hayes-roth dataset : {(sum(list_of_scores) / len(list_of_scores)) * 100}')


Implementation of KNN algorithm from scratch with functions to evaluate it with a k-fold cross-validation for hayes-roth dataset, k=9:
Scores with Train and Validation Set : [0.7692307692307693, 0.5384615384615384, 0.46153846153846156, 0.6923076923076923, 0.38461538461538464, 0.6153846153846154, 0.5384615384615384, 0.5384615384615384, 0.5384615384615384, 0.6153846153846154]
Mean Accuracy (Train and Validation Set) : 56.92307692307692
Scores on Test Data : [0.38636363636363635]
Mean Accuracy (Test Data) of scores for hayes-roth dataset : 38.63636363636363


# **Implementation of KNN algorithm from scratch with functions to evaluate it with a k-fold cross validation, k=10 for Car dataset**

In [233]:
file_name = "car.data"
data = load_csv('car.data')
df = []
# Loop over each data entry in the dataset
for i in data:
    # Create an empty list for each data entry to store the transformed values
    tuple = []
    # Loop over each attribute in the data entry
    for j in i:
        # Transform 'vhigh', 'high', 'med', 'low' to numerical values
        if j == 'vhigh':
            tuple.append(1)
        elif j == 'high':
            tuple.append(2)
        elif j == 'med':
            tuple.append(3)
        elif j == 'low':
            tuple.append(4)

        # Transform '2', '3', '4', '5more' to numerical values
        elif j == '2':
            tuple.append(1)
        elif j == '3':
            tuple.append(2)
        elif j == '4':
            tuple.append(3)
        elif j == '5more':
            tuple.append(4)

        # Transform 'big', 'med', 'small' to numerical values
        elif j == 'big':
            tuple.append(1)
        elif j == 'med':
            tuple.append(2)
        elif j == 'small':
            tuple.append(3)

        # Transform 'unacc', 'acc', 'good', 'vgood' to numerical values
        elif j == 'unacc':
            tuple.append(1)
        elif j == 'acc':
            tuple.append(2)
        elif j == 'good':
            tuple.append(3)
        elif j == 'vgood':
            tuple.append(4)

    # Append the transformed tuple to the dataframe
    df.append(tuple)

# Define the number of folds for cross-validation
k_folds = 10
# Define the number of neighbors for the k-NN algorithm
no_neigh = 4

# Apply MinMaxScaler to the dataframe (assuming MinMaxScaler is a function defined elsewhere)
MinMaxScaler(df)

# Run the k-NN algorithm (assuming knn_algorithm is a function defined elsewhere)
knn_scratch_car = knn_algorithm(df, no_neigh, k_folds)

print(f'Implementation of KNN algorithm from scratch with functions to evaluate it with a k-fold cross-validation for Car dataset, k=9:')
print(f'Scores : {knn_scratch_car}')
print(f'Mean Accuracy : {(sum(knn_scratch_car) / len(knn_scratch_car))*100}')

Implementation of KNN algorithm from scratch with functions to evaluate it with a k-fold cross-validation for Car dataset, k=9:
Scores : [0.7093023255813954, 0.6453488372093024, 0.6104651162790697, 0.6627906976744186, 0.6686046511627907, 0.6511627906976745, 0.627906976744186, 0.6511627906976745, 0.6686046511627907, 0.686046511627907]
Mean Accuracy : 65.8139534883721


# **Implementation of KNN algorithm from scratch with functions to evaluate it with a k-fold cross validation, k=10 for Breast-cancer dataset**

In [234]:
# Load the breast-cancer dataset and preprocess it
file_name = "breast-cancer.data"
data = load_csv(file_name)

# Initialize an empty list to store the processed data
df = []
m = 0  # Counter for dataset rows

# Iterate through each row in the dataset
for i in data:
    tup = list()  # Create an empty list to store the processed tuple
    m += 1  # Increment the counter for each row

    # Iterate through each attribute in the tuple and map it to numerical values
    for j in i:
        if j == 'no-recurrence-events':
            tup.append(1)
        elif j == 'recurrence-events':
            tup.append(2)
        # ... (similar mappings for other attributes)

    df.append(tup)  # Append the processed tuple to the dataset

# Swap the first and last attributes for each tuple in the dataset
for i in df:
    temp = i[0]
    i[0] = i[-1]
    i[-1] = temp

k_folds = 10
no_of_neighbors_3 = 2

# Perform Min-Max scaling on the dataset
MinMaxScaler(df)

# Perform k-NN with k-fold cross-validation
knn_scratch_cancer = knn_algorithm(df, no_of_neighbors_3, k_folds)

# Print the results
print(f'Implementation of KNN algorithm from scratch with functions to evaluate it with a k-fold cross-validation for breast_cancer dataset, k=9:')
print(f'Scores : {knn_scratch_cancer}')
print(f'Mean Accuracy for breast-cancer dataset: {(sum(knn_scratch_cancer) / len(knn_scratch_cancer))*100}')


Implementation of KNN algorithm from scratch with functions to evaluate it with a k-fold cross-validation for breast_cancer dataset, k=9:
Scores : [0.2857142857142857, 0.6071428571428571, 0.8571428571428571, 0.6071428571428571, 0.7857142857142857, 0.75, 0.7142857142857143, 0.6785714285714286, 0.6785714285714286, 0.6071428571428571]
Mean Accuracy for breast-cancer dataset: 65.71428571428571


In [235]:
# Import necessary libraries
import numpy as np             # NumPy for numerical operations
import pandas as pd            # Pandas for handling data in tabular format
import matplotlib.pyplot as plt   # Matplotlib for plotting
import seaborn as sns          # Seaborn for enhanced data visualization
from sklearn import preprocessing   # scikit-learn's preprocessing module for data preprocessing
from sklearn.preprocessing import LabelEncoder   # LabelEncoder for label encoding
from sklearn.neighbors import KNeighborsClassifier   # KNeighborsClassifier for k-NN classification
from sklearn.model_selection import cross_val_score   # cross_val_score for cross-validation
import warnings               # Warnings to manage warnings during code execution
warnings.filterwarnings('ignore')   # Ignore warnings during execution
from scipy import stats       # SciPy for scientific and technical computing

# **Implementation of KNN algorithm with a k-fold cross validation, k=10 using scikit learn library for Hayes-roth dataset**


In [236]:

# Load the dataset into a DataFrame
df1 = pd.read_csv('hayes-roth.data', header=None)

# Convert categorical labels to numerical representations using LabelEncoder
label_encoder = LabelEncoder()
df1.iloc[:, -1] = label_encoder.fit_transform(df1.iloc[:, -1])

# Perform Min-Max scaling for features
scaler = preprocessing.MinMaxScaler()
df1_min_max_x = scaler.fit_transform(df1.iloc[:, 1:-1])

# Create and train a KNN model
model = KNeighborsClassifier(n_neighbors=3)
model.fit(df1_min_max_x, df1.iloc[:, -1])

# Perform cross-validation
knn_scikit_hayes = cross_val_score(estimator=model, X=df1_min_max_x, y=df1.iloc[:, -1], cv=10)

# Print the results
print('Implementation of KNN algorithm with k-fold cross-validation (k=9) using scikit-learn library for Hayes-Roth Dataset:')
print(f'Scores : {knn_scikit_hayes}')
print(f'Mean Accuracy for Hayes-Roth dataset: {np.mean(knn_scikit_hayes) * 100:.2f}')


Implementation of KNN algorithm with k-fold cross-validation (k=9) using scikit-learn library for Hayes-Roth Dataset:
Scores : [0.64285714 0.64285714 0.61538462 0.53846154 0.61538462 0.38461538
 0.53846154 0.53846154 0.69230769 0.69230769]
Mean Accuracy for Hayes-Roth dataset: 59.01


# **Implementation of KNN algorithm with a k-fold cross validation, k=10 using scikit learn library for Car dataset**


In [237]:
# Load the dataset into a DataFrame
df2 = pd.read_csv('car.data', header=None)

# Convert categorical labels to numerical representations using LabelEncoder
label_encoder = LabelEncoder()
for i in range(df2.shape[1]):
    df2.iloc[:, i] = label_encoder.fit_transform(df2.iloc[:, i])

# Perform Min-Max scaling for features
scaler = preprocessing.MinMaxScaler()
df2_min_max_x = scaler.fit_transform(df2.iloc[:, 0:-1])

# Create and train a KNN model
model = KNeighborsClassifier(n_neighbors=4)
model.fit(df2_min_max_x, df2.iloc[:, -1])

# Perform cross-validation
knn_scikit_car = cross_val_score(estimator=model, X=df2_min_max_x, y=df2.iloc[:, -1], cv=10)

# Print the results
print('Implementation of KNN algorithm with k-fold cross-validation (k=10) using scikit-learn library for Car Dataset:')
print(f'Scores : {knn_scikit_car}')
print(f'Mean Accuracy for Car dataset: {np.mean(knn_scikit_car) * 100:.2f}')


Implementation of KNN algorithm with k-fold cross-validation (k=10) using scikit-learn library for Car Dataset:
Scores : [0.7283237  0.73988439 0.87861272 0.68208092 0.87861272 0.8150289
 0.78034682 0.90751445 0.89534884 0.90116279]
Mean Accuracy for Car dataset: 82.07


# **Implementation of KNN algorithm with a k-fold cross validation, k=10 using scikit learn library for Breast-cancer dataset**


In [238]:
# Load the dataset into a DataFrame
df3 = pd.read_csv('breast-cancer.data', header=None)

# Convert categorical labels to numerical representations using LabelEncoder
label_encoder = LabelEncoder()
for i in range(df3.shape[1]):
    df3.iloc[:, i] = label_encoder.fit_transform(df3.iloc[:, i])

# Perform Min-Max scaling for features
scaler = preprocessing.MinMaxScaler()
df3_min_max_x = scaler.fit_transform(df3.iloc[:, 1:])

# Create and train a KNN model
model = KNeighborsClassifier(n_neighbors=2)
model.fit(df3_min_max_x, df3.iloc[:, 0])

# Perform cross-validation
knn_scikit_cancer = cross_val_score(estimator=model, X=df3_min_max_x, y=df3.iloc[:, 0], cv=10)

# Print the results
print('Implementation of KNN algorithm with k-fold cross-validation (k=10) using scikit-learn library for Breast Cancer Dataset:')
print(f'Scores : {knn_scikit_cancer}')
print(f'Mean Accuracy for Breast Cancer dataset: {np.mean(knn_scikit_cancer) * 100:.2f}')

Implementation of KNN algorithm with k-fold cross-validation (k=10) using scikit-learn library for Breast Cancer Dataset:
Scores : [0.68965517 0.68965517 0.72413793 0.72413793 0.75862069 0.68965517
 0.78571429 0.64285714 0.78571429 0.75      ]
Mean Accuracy for Breast Cancer dataset: 72.40


# **Hypothesis Testing results**

In [239]:
# Perform paired t-tests and hypothesis testing for Hayes-Roth dataset
t_stat_hayes, p_value_hayes = stats.ttest_rel(knn_scratch_hayes, knn_scikit_hayes)

# Check if the p-value is less than 0.05 (common significance level)
if p_value_hayes < 0.05:
    print(f"p = {p_value_hayes:.5f}, Since the generated KNN and scikit-learn KNN for the Hayes-Roth dataset differ significantly, we reject Ho (null hypothesis).")
else:
    print(f"p = {p_value_hayes:.5f}, Since the generated KNN and scikit-learn KNN for Hayes-Roth dataset have similar performance, we accept Ho (null hypothesis).")

# Perform paired t-tests and hypothesis testing for Car dataset
t_stat_car, p_value_car = stats.ttest_rel(knn_scratch_car, knn_scikit_car)

# Check if the p-value is less than 0.05 (common significance level)
if p_value_car < 0.05:
    print(f"p = {p_value_car:.5f}, Since the generated KNN and scikit-learn KNN for the Car dataset differ significantly, we reject Ho (null hypothesis).")
else:
    print(f"p = {p_value_car:.5f}, Since the generated KNN and scikit-learn KNN for Car dataset have similar performance, we accept Ho (null hypothesis).")

# Perform paired t-tests and hypothesis testing for Breast Cancer dataset
t_stat_cancer, p_value_cancer = stats.ttest_rel(knn_scratch_cancer, knn_scikit_cancer)

# Check if the p-value is less than 0.05 (common significance level)
if p_value_cancer < 0.05:
    print(f"p = {p_value_cancer:.5f}, Since the generated KNN and scikit-learn KNN for the Breast-cancer dataset differ significantly, we reject Ho (null hypothesis).")
else:
    print(f"p = {p_value_cancer:.5f}, Since the generated KNN and scikit-learn KNN for Breast-cancer dataset have similar performance, we accept Ho (null hypothesis).")


p = 0.67277, Since the generated KNN and scikit-learn KNN for Hayes-Roth dataset have similar performance, we accept Ho (null hypothesis).
p = 0.00032, Since the generated KNN and scikit-learn KNN for the Car dataset differ significantly, we reject Ho (null hypothesis).
p = 0.18899, Since the generated KNN and scikit-learn KNN for Breast-cancer dataset have similar performance, we accept Ho (null hypothesis).
