In [145]:
import pandas as pd

df = pd.read_csv(
    "dermatology.csv",
    sep="\t",
    encoding="utf-8-sig",
    na_values="?"
)

df.columns = df.columns.str.strip()

y_val = df.iloc[:,-1].to_numpy()
X_val = df["Age"].to_numpy()

In [146]:
# splitting the data frame into train validate and test

def split_train_val_test(X, y, train_size=0.7, test_size=0.3, seed = 42):
    assert abs(train_size+ test_size - 1.0) < 1e-9

    n = len(X) # Gives the number of rows

    rng = np.random.default_rng(seed)

    idx = rng.permutation(n)

    n_train = int(train_size * n)
    train_idx = idx[:n_train]
    test_idx = idx[n_train:]

    return X[train_idx], y[train_idx], X[test_idx], y[test_idx]


In [147]:
# Create a gradient decent
def compute_cost(X, y, theta):
    m = len(y)
    y_hat = X @ theta
    cost = (1/(2*m)) * np.sum(np.square(y_hat - y))

    return float(cost)

import numpy as np

def batch_gradient_descent(X, y, learning_rate=0.001, epsilon=1e-4, max_iteration=5):
    """
    Iteratively updates the weights to minimize the cost.

    :param X: The features
    :param y: The labels
    :param learning_rate: The step size of how much to change theta
    :param epsilon: The convergence tolerance
    :param max_iteration: The maximum number of iterations
    :return: Returns a weight vector derived with batch gradient decent.
    """
    # Checks if the array is 2D array.
    X = np.asarray(X)
    y = np.asarray(y)

    if X.ndim == 1:
        X = X.reshape(-1, 1)
    if y.ndim == 1:
        y = y.reshape(-1, 1)

    # Gets the m and n dimension (m,n)
    m, n = X.shape

    # Initialize theta matrix with all ones
    theta = np.ones((n, 1), dtype=int)

    costs = [compute_cost(X, y, theta)]
    current_iteration = 0

    # This section is the gradient decent
    while current_iteration < max_iteration:
        y_hat = X @ theta
        diff = y_hat - y

        # A vector with the same shape as theta. Gives direction of where the cost increases fastest.
        # Large grad means the slope of the cost is steep. Small grad means the slope of the cost is flat.
        grad = (X.T @ diff) / m

        # The individual weights
        theta = theta - learning_rate * grad

        new_cost = compute_cost(X, y, theta)
        costs.append(new_cost)

        if abs(costs[-1] - costs[-2]) <= epsilon:
            break

        if costs[-1] > costs[-2]:
            print("Cost is increasing â€” reduce alpha.")
            break

        current_iteration += 1

    print("Completed in", current_iteration, "iterations.")
    return theta, costs



In [148]:
# Imputing the values so that it replaces the nan values with the mean to not encourage biases but also not removing
# data

def impute(X):
    """
    Replaces nan values with the column mean.

    :param X: the X column
    :return: new array with replaced nan values
    """
    _X = []


    num_nans = 0

    for index in range(len(X)):
        if np.isnan(X[index]):
            num_nans+=1


    def _get_median(x):
        """
        Get the median of the column to fill in nan values.

        :param x: The feature column
        :return: The mean of the column
        """
        r = [x[0]]
        _index = 0
        for j in range(len(x)):
            if not np.isnan(x[j]):
                if r[_index] > x[j]:
                    buff = r.pop(_index)
                    r.append(x[j])
                    r.append(buff)
                else:
                    r.append(x[j])
                _index += 1


        return r[int(len(r)/2)]

    meadian = _get_median(X)

    index = 0
    for x_val in X:
        if np.isnan(x_val):
            _X.append(int(meadian))
        else:
            _X.append(int(x_val))
        index += 1

    return np.asarray(_X)

In [149]:
def setClassifier(y_hat_vals):
    _y = []
    for _i in range(len(y_hat_vals)):
        if int(y_hat_vals[_i]) == 0:
            _y.append(1)
        else:
            _y.append(int(round(y_hat_vals[_i], 1)))

    return _y


In [150]:

avg_acc = 0
seed_min = 0
seed_max = 100

for i in range(seed_min, seed_max + 1, 1):
    X_train, y_train, X_test, y_test = split_train_val_test(X_val, y_val, seed=i)

    X_train = impute(X_train).reshape(-1,1)
    X_test = impute(X_test).reshape(-1, 1)

    w_hat,cost_hist = batch_gradient_descent(X_train, y_train)

    yhat_test = X_test @ w_hat
    y_pred = np.clip(np.rint(yhat_test), 1, 6).astype(int)

    test_acc = np.mean(y_pred == y_test)

    avg_acc += test_acc

avg_acc /= (seed_max - seed_min)

print(f'Test Average Accuracy: %{avg_acc * 100:.2f}')


Completed in 5 iterations.
Completed in 5 iterations.
Completed in 5 iterations.
Completed in 5 iterations.
Completed in 5 iterations.
Completed in 5 iterations.
Completed in 5 iterations.
Completed in 5 iterations.
Completed in 5 iterations.
Completed in 5 iterations.
Completed in 5 iterations.
Completed in 5 iterations.
Completed in 5 iterations.
Completed in 5 iterations.
Completed in 5 iterations.
Completed in 5 iterations.
Completed in 5 iterations.
Completed in 5 iterations.
Completed in 5 iterations.
Completed in 5 iterations.
Completed in 5 iterations.
Completed in 5 iterations.
Completed in 5 iterations.
Completed in 5 iterations.
Completed in 5 iterations.
Completed in 5 iterations.
Completed in 5 iterations.
Completed in 5 iterations.
Completed in 5 iterations.
Completed in 5 iterations.
Completed in 5 iterations.
Completed in 5 iterations.
Completed in 5 iterations.
Completed in 5 iterations.
Completed in 5 iterations.
Completed in 5 iterations.
Completed in 5 iterations.
C

## Report analysis (Predicting disease with Age using Batch Gradient Decent):
The model used here to generate the prediction is the batch gradient decent.
I split the data up 70/30 where I trained with 70% of the data set and I tested using 30% of the data set. The model
attempted to use Age as a means to predict the type of disease, however, as you may see in the output, the accuracy
is averages around %22.46, meaning that this is a horrible model to use for this specific data set. The parameters I
set for the gradient decent is alpha=0.001, epsilon=1e-4, max_iters=100000. I found that with alpha values any higher
 than 0.001, the model doesn't converge.

After adjusting the max number of iterations from 100000 to 5, I found that the average accuracy score went up to %28
.65, however, that is still within the realms of the model just making random guesses.
