Question 1
Decision Tree

In [30]:
import pandas as pd
import numpy as np


data = pd.read_csv("car_evaluation.csv")


data['buying price'] = data['buying price'].astype('category').cat.codes
data['maintenance cost'] = data['maintenance cost'].astype('category').cat.codes
data['number of doors'] = data['number of doors'].astype('category').cat.codes
data['number of persons'] = data['number of persons'].astype('category').cat.codes
data['lug_boot'] = data['lug_boot'].astype('category').cat.codes
data['safety'] = data['safety'].astype('category').cat.codes
data['decision'] = data['decision'].astype('category').cat.codes

train_size = int(0.8 * len(data))
train_data = data[:train_size]
test_data = data[train_size:]


def calculate_gini_impurity(labels):
    unique_labels, counts = np.unique(labels, return_counts=True)
    probabilities = counts / len(labels)
    gini_impurity = 1 - np.sum(probabilities**2)
    return gini_impurity



def find_best_split(data): #best split
    best_gini = 1
    best_split = None
    features = data.columns[:-1]

    for feature in features:
        unique_values = data[feature].unique()
        for value in unique_values:
            left_data = data[data[feature] <= value]
            right_data = data[data[feature] > value]
            left_gini = calculate_gini_impurity(left_data['decision'])
            right_gini = calculate_gini_impurity(right_data['decision'])
            weighted_gini = (len(left_data) / len(data)) * \
                left_gini + (len(right_data) / len(data)) * right_gini
            if weighted_gini < best_gini:
                best_gini = weighted_gini
                best_split = (feature, value)

    return best_split


class DecisionTree:
    def __init__(self):
        self.left = None
        self.right = None
        self.feature = None
        self.value = None
        self.label = None

    def fit(self, data):
        unique_labels = data['decision'].unique()
        if len(unique_labels) == 1:
            self.label = unique_labels[0]
            return
        best_split = find_best_split(data)
        if best_split is None:
            self.label = unique_labels[np.argmax(
                np.bincount(data['decision']))]
            return
        self.feature, self.value = best_split
        left_data = data[data[self.feature] <= self.value]
        right_data = data[data[self.feature] > self.value]
        self.left = DecisionTree()
        self.left.fit(left_data)
        self.right = DecisionTree()
        self.right.fit(right_data)

    def predict(self, x):
        if self.label is not None:
            return self.label
        if x[self.feature] <= self.value:
            return self.left.predict(x)
        else:
            return self.right.predict(x)


tree = DecisionTree()
tree.fit(train_data)

correct = 0
total = len(test_data)
for index, row in test_data.iterrows():
    prediction = tree.predict(row)
    if prediction == row['decision']:
        correct += 1

accuracy = correct / total
print("Accuracy:", accuracy)


Accuracy: 0.8265895953757225


In [33]:
num_folds = 5
fold_size = len(data) // num_folds
validation_size = 0.1
accuracies = []

for fold in range(num_folds):
    start = fold * fold_size
    end = (fold + 1) * fold_size
    validation_data = data[start:end]
    train_data = pd.concat([data[:start], data[end:]])

    tree = DecisionTree()
    tree.fit(train_data)

    correct = 0
    total = len(validation_data)
    for index, row in validation_data.iterrows():
        prediction = tree.predict(row)
        if prediction == row['decision']:
            correct += 1

    accuracy = correct / total
    accuracies.append(accuracy)

average_accuracy = np.mean(accuracies)
print(" Average Accuracy over 5-fold Cross-Validation :", average_accuracy)


 Average Accuracy over 5-fold Cross-Validation with k=5: 0.8324637681159419


Question 2
KNN classifier

In [21]:
import pandas as pd
import numpy as np
from collections import Counter

data = pd.read_csv("car_evaluation.csv")

def categorical_to_numerical(data):
    category_map = {}
    for column in data.columns:
        unique_values = data[column].unique()
        category_map[column] = {value: idx for idx,value in enumerate(unique_values)}
        data[column] = data[column].map(category_map[column])
    return data, category_map


data, category_map = categorical_to_numerical(data)

def train_test_split(data, test_size=0.2):
    num_samples = len(data)
    test_samples = int(test_size * num_samples)
    shuffled_indices = np.random.permutation(num_samples)
    test_indices = shuffled_indices[:test_samples]
    train_indices = shuffled_indices[test_samples:]
    return data.iloc[train_indices], data.iloc[test_indices]


train_data, test_data = train_test_split(data)

def knn_classifier(train_data, test_data, k=5):
    predictions = []
    for _, test_instance in test_data.iterrows():
        distances = []
        for _, train_instance in train_data.iterrows():
            euclidean_distance = np.linalg.norm(
                test_instance[:-1] - train_instance[:-1])
            distances.append((euclidean_distance, train_instance[-1]))
        distances.sort(key=lambda x: x[0])
        k_nearest_neighbors = distances[:k]
        neighbor_labels = [neighbor[1] for neighbor in k_nearest_neighbors]
        most_common_label = Counter(neighbor_labels).most_common(1)[0][0]
        predictions.append(most_common_label)
    return predictions


k = 5 
predictions = knn_classifier(train_data, test_data, k)

def accuracy(y_true, y_pred):
    correct = sum(1 for true, pred in zip(y_true, y_pred) if true == pred)
    return correct / len(y_true)


test_labels = test_data['decision'].values
accuracy_score = accuracy(test_labels, predictions)
print(f"Accuracy of KNN Classifier with k={k}: {accuracy_score:.2f}")


Accuracy of KNN Classifier with k=5: 0.96


In [24]:
data = data.sample(frac=1).reset_index(drop=True)

num_folds = 5
validation_size = 0.1
fold_size = int(len(data) / num_folds)
folds = []
for i in range(num_folds):
    start_idx = i * fold_size
    end_idx = start_idx + fold_size
    if i == num_folds - 1:
        end_idx = len(data)
    fold = data[start_idx:end_idx]
    folds.append(fold)
    
accuracies = []
for i in range(num_folds):
    validation_data = folds[i]
    train_data = pd.concat([fold for j, fold in enumerate(folds) if j != i])

    k = 5  
    predictions = knn_classifier(train_data, validation_data, k)

    test_labels = validation_data['decision'].values
    accuracy_score = accuracy(test_labels, predictions)
    accuracies.append(accuracy_score)

average_accuracy = np.mean(accuracies)
print(f"Average Accuracy over {num_folds}-fold Cross-Validation with k={k}: {average_accuracy:.2f}")


Average Accuracy over 5-fold Cross-Validation with k=5: 0.95
