In [56]:
import numpy as np
import pandas as pd

# Load the tab-separated dermatology table; "?" entries are parsed as NaN.
df = pd.read_csv("dermatology.csv", sep='\t', encoding="utf-8-sig", na_values="?")

# Strip stray whitespace from the header so df["Age"] below resolves by name.
df.columns = df.columns.str.strip()

# Every column but the last is used as a feature; the last one is the target.
Xfull = df.iloc[:, :-1].to_numpy(dtype=float)
yfull = df.iloc[:, -1].to_numpy()

# Per-column means, ignoring NaNs. A single mean over the whole matrix would
# mix unrelated features (the graded 0-3 columns and Age) into one fill value.
mean_value = np.nanmean(Xfull, axis=0)

# Replace each NaN with the mean of the column it sits in
nan_rows, nan_cols = np.nonzero(np.isnan(Xfull))
Xfull[nan_rows, nan_cols] = mean_value[nan_cols]

# The label text says X_train, but the count is taken over Xfull.
print("Total NaNs in X_train:", np.isnan(Xfull).sum())

# Age column read straight from the frame (not from the imputed Xfull).
X_age = df["Age"].to_numpy()

print(Xfull)
print(yfull)

Total NaNs in X_train: 0
[[ 2.  2.  0. ...  1.  0. 55.]
 [ 3.  3.  3. ...  1.  0.  8.]
 [ 2.  1.  2. ...  2.  3. 26.]
 ...
 [ 3.  2.  2. ...  2.  3. 28.]
 [ 2.  1.  3. ...  2.  3. 50.]
 [ 3.  2.  2. ...  3.  0. 35.]]
[2 1 3 1 3 2 5 3 4 4 1 2 2 1 3 4 2 1 3 5 6 2 5 3 5 1 6 5 2 3 1 2 1 1 4 2 3
 2 3 1 2 4 1 2 5 3 4 6 2 3 3 4 1 1 5 1 2 3 4 2 6 1 5 1 2 3 1 4 5 1 2 6 3 5
 4 2 2 1 3 5 1 2 2 2 5 1 1 3 1 4 2 2 5 1 3 4 2 5 1 6 2 5 1 2 2 1 4 1 3 1 1
 3 5 3 3 5 2 3 4 1 2 5 6 1 1 2 6 3 5 4 1 1 3 5 5 1 4 2 3 1 2 1 1 3 3 3 2 5
 4 2 2 1 1 1 5 3 2 3 2 2 4 2 3 6 2 1 1 3 4 3 3 1 1 1 3 1 1 2 3 3 1 1 1 1 6
 2 2 2 2 1 3 3 3 1 1 2 3 2 2 2 5 5 5 5 5 1 1 1 1 1 1 1 3 3 3 3 3 3 4 4 4 4
 5 5 5 5 5 5 5 2 2 2 2 1 1 1 1 1 1 6 6 1 1 1 1 1 1 1 1 3 3 3 3 3 3 3 4 4 4
 4 4 4 5 5 5 5 6 6 6 4 4 4 1 1 1 1 1 2 2 4 4 4 1 1 2 2 2 3 3 3 3 1 1 1 1 5
 5 5 5 5 3 3 3 4 1 1 4 4 4 1 1 1 3 3 3 3 3 1 1 1 1 4 4 1 1 4 3 3 4 1 1 4 4
 5 5 1 1 5 5 3 1 5 5 6 6 4 4 6 6 6 1 1 1 5 5 1 1 1 1 2 2 4 4 3 3 1]


In [57]:
# Define the gradient descent function
def gradient_descent(X, y, learning_rate=0.0005, iterations=1000, verbose=True):
    """Fit linear weights (no intercept term) by batch gradient descent.

    Each iteration moves ``theta`` against the gradient of the cost returned by
    ``compute_cost`` and records the cost obtained with the updated weights.

    Parameters
    ----------
    X : array-like, shape (m,) or (m, n)
        Feature values. A 1-D input is treated as a single feature column;
        a 2-D input keeps one weight per column.
    y : array-like, shape (m,) or (m, 1)
        Target values, one per row of ``X``.
    learning_rate : float
        Step size applied to the averaged gradient.
    iterations : int
        Number of update steps.
    verbose : bool
        When true, print the number of samples and then the number of features.

    Returns
    -------
    theta : ndarray, shape (n, 1)
        Learned weights.
    cost_history : ndarray, shape (iterations,)
        Cost after each update.

    Raises
    ------
    ValueError
        If ``X`` has more than two dimensions or ``y`` does not have one value
        per row of ``X``.
    """
    X = np.asarray(X, dtype=float)
    if X.ndim < 2:
        # Only a flat vector is turned into a column; reshaping a 2-D matrix
        # the same way would collapse all of its features into one.
        X = X.reshape(-1, 1)
    elif X.ndim > 2:
        raise ValueError(f"X must be 1-D or 2-D, got {X.ndim} dimensions")

    y = np.asarray(y, dtype=float).reshape(-1, 1)
    m, n = X.shape
    if y.shape[0] != m:
        raise ValueError(f"X has {m} rows but y has {y.shape[0]} values")

    if verbose:
        print(m)
        print(n)

    theta = np.zeros((n, 1))
    cost_history = np.zeros(iterations)

    for i in range(iterations):
        error = X @ theta - y
        theta = theta - (learning_rate / m) * (X.T @ error)
        cost_history[i] = compute_cost(X, y, theta)

    return theta, cost_history

# Define the cost computation function
def compute_cost(X, y, theta):
    """Return the halved mean squared error of the linear prediction ``X·theta``.

    The squared residuals ``(X·theta - y)`` are summed and scaled by
    ``1 / (2 * len(y))``.
    """
    residuals = np.dot(X, theta) - y
    sample_count = len(y)
    scale = 1 / (2 * sample_count)
    return scale * np.sum(residuals ** 2)

In [58]:
def train_test_split(X, y, test_frac=0.2, seed=42):
    """
    Partition (X, y) into train and test parts using a seeded shuffle.

    The row indices ``0..len(X)-1`` are shuffled with a generator seeded by
    ``seed``. The first ``round(test_frac * len(X))`` shuffled indices select
    the test rows and the rest select the training rows.

    Returns ``(X_train, X_test, y_train, y_test)``.
    """
    total = len(X)
    order = np.arange(total)
    np.random.default_rng(seed).shuffle(order)

    n_test = int(np.round(test_frac * total))
    held_out, kept = order[:n_test], order[n_test:]

    return X[kept], X[held_out], y[kept], y[held_out]

# Seeded 80/20 split of the single Age feature against the class labels.
X_train, X_test, y_train, y_test = train_test_split(
    X=X_age, y=yfull, test_frac=0.2, seed=42
)

In [59]:
def impute(X):
    """Fill missing entries of a 1-D sequence with the mean of the observed ones.

    An entry counts as missing when it is the string "?" (surrounding
    whitespace ignored) or converts to NaN. Every other entry is converted
    with ``float``.

    Parameters
    ----------
    X : iterable of numbers or numeric strings
        Values to clean.

    Returns
    -------
    ndarray of int
        Same length as ``X``. Observed values are cast with ``astype(int)``;
        missing values are replaced by the observed mean rounded to the
        nearest integer, so the fill value is not biased low by truncation.

    Raises
    ------
    ValueError
        If ``X`` is non-empty and every entry is missing, since no mean exists.
    """
    # convert to float and replace "?" with np.nan
    values = np.array(
        [np.nan if isinstance(v, str) and v.strip() == "?" else float(v) for v in X],
        dtype=float,
    )

    missing = np.isnan(values)
    if values.size and missing.all():
        # Casting NaN to int gives meaningless numbers; fail loudly instead.
        raise ValueError("cannot impute: every value is missing")

    if missing.any():
        # Round the mean before the integer cast below would truncate it.
        values[missing] = np.rint(np.nanmean(values))

    return values.astype(int)
    
# Fill the missing ages in both splits; the test ages are needed for prediction.
X_train = impute(X_train)
X_test = impute(X_test)

w, cost_history = gradient_descent(X_train, y_train)

print(w)

# Predict from the held-out ages (features), not from the held-out labels.
ypred = X_test.reshape(-1, 1) @ w
y_test = y_test.reshape(-1, 1)

# ypred is a real-valued column, so it cannot be scored with accuracy_score
# as-is (accuracy_score is also only imported in a later cell).
#accuracy = accuracy_score(y_test, ypred)
#print(f"Accuracy: {accuracy}")

293
1
[[0.06198085]]


In [60]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score


# Seeded 80/20 split over the full imputed feature matrix.
X_train, X_test, y_train, y_test = train_test_split(
    Xfull, yfull, test_frac=0.2, seed=42
)

# 100-tree random forest with a fixed random_state, fit on the training rows.
forest = RandomForestClassifier(random_state=42, n_estimators=100).fit(X_train, y_train)

# Score the held-out rows.
ypred = forest.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=ypred)
print(f"Accuracy: {accuracy}")

Accuracy: 0.9863013698630136


In [61]:
from sklearn.neighbors import KNeighborsClassifier

# Seeded split so all three classifiers are fit and scored on identical rows.
X_train, X_test, y_train, y_test = train_test_split(
    Xfull, yfull, test_frac=0.2, seed=42
)

# Only the neighbour-search `algorithm` argument differs between the fits below.

# 1. Brute Force
knn_brute = KNeighborsClassifier(algorithm='brute', n_neighbors=5).fit(X_train, y_train)
y_pred_brute = knn_brute.predict(X_test)
print("Brute Accuracy:", accuracy_score(y_true=y_test, y_pred=y_pred_brute))


# 2. KD-Tree
knn_kd = KNeighborsClassifier(algorithm='kd_tree', n_neighbors=5).fit(X_train, y_train)
y_pred_kd = knn_kd.predict(X_test)
print("KD-Tree Accuracy:", accuracy_score(y_true=y_test, y_pred=y_pred_kd))


# 3. Ball-Tree
knn_ball = KNeighborsClassifier(algorithm='ball_tree', n_neighbors=5).fit(X_train, y_train)
y_pred_ball = knn_ball.predict(X_test)
print("Ball-Tree Accuracy:", accuracy_score(y_true=y_test, y_pred=y_pred_ball))

Brute Accuracy: 0.8767123287671232
KD-Tree Accuracy: 0.8767123287671232
Ball-Tree Accuracy: 0.8767123287671232
