In [26]:
import numpy as np
from scipy.special import softmax
from sklearn.preprocessing import StandardScaler

In [27]:
train = np.loadtxt('Assignment1.2/train2.csv', delimiter=',', skiprows=1)
test = np.loadtxt('Assignment1.2/test2.csv', delimiter=',', skiprows=1)

In [28]:
print(train.shape)
print(test.shape)

(87595, 26)
(58397, 25)


In [29]:
# tell the count of unique values in each column of train and test
train_unique = []
for i in range(train.shape[1]):
    train_unique.append(len(np.unique(train[:, i])))
    # print(i, len(np.unique(train[:, i])))

# print()
test_unique = []
for i in range(test.shape[1]):
    test_unique.append(len(np.unique(test[:, i])))
    # print(i, len(np.unique(test[:, i])))

In [5]:
print(train[:10,7])  # total costs
print(train[:10,10]) # length of stay
print(train[:10,23]) # birth weight

[43612.44 30877.11 24295.14 24335.61 32814.89 17150.43 74122.19 37730.52
 11335.41 13891.18]
[11. 13.  2. 12.  9.  2. 18. 17.  4.  1.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]


In [5]:
# sum the values in train_unique and test_unique
train_sum = sum(train_unique) - train_unique[7] - train_unique[10] - train_unique[23] - train_unique[25] - train_unique[13] - train_unique[14] - train_unique[15]
test_sum = sum(test_unique) - test_unique[7] - test_unique[10] - test_unique[23] - test_unique[13] - test_unique[14] - test_unique[15]
print(train_sum, test_sum)

737 728


In [31]:
import numpy as np
from sklearn.preprocessing import OneHotEncoder, StandardScaler

columns_to_one_hot_encode = [0, 1, 2, 3, 4, 5, 6, 8, 9, 11, 12, 16, 17, 18, 19, 20, 21, 22, 24]
column_names_for_one_hot_encode = [
    "Hospital Service Area",
    "Hospital County",
    "Operating Certificate Number",
    "Permanent Facility Id",
    "Facility Name",
    "Age Group",
    "Zip Code - 3 digits",
    "Race",
    "Ethnicity",
    "Type of Admission",
    "Patient Disposition",
    "APR MDC Code",
    "APR Severity of Illness Description",
    "APR Risk of Mortality",
    "APR Medical Surgical Description",
    "Payment Typology 1",
    "Payment Typology 2",
    "Payment Typology 3",
    "Emergency Department Indicator"
]

columns_to_target_encode = [13, 14, 15]
column_names_for_target_encode = [
    "CCSR Diagnosis Code",
    "CCSR Procedure Code",
    "APR DRG Code",
]

numerical_columns = [7, 10, 23]
numerical_column_names = [
    "Total Costs",
    "Length of Stay",
    "Birth Weight",
]

def target_encode(train, Y, columns):
    target_encoded_columns = []
    target_encoded_column_names = []
    target_encoders = {}
    for col in columns:
        unique_values = np.unique(train[:, col])
        mean_encoded = {val: np.mean(Y[train[:, col] == val]) for val in unique_values}
        target_encoders[col] = mean_encoded
        target_encoded_columns.append(np.vectorize(mean_encoded.get)(train[:, col]))
        target_encoded_column_names.append([f"{column_names_for_target_encode[columns.index(col)]}_{val}" for val in unique_values])
    return np.column_stack(target_encoded_columns), target_encoded_column_names, target_encoders

def encode(data, Y, encoder=None, target_encoder=None):
    if encoder is None:
        encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
        encoded_columns = encoder.fit_transform(data[:, columns_to_one_hot_encode])
    else:
        encoded_columns = encoder.transform(data[:, columns_to_one_hot_encode])
        # Handle unknown categories in test set
        for i, categories in enumerate(encoder.categories_):
            col = data[:, columns_to_one_hot_encode[i]]
            unknown_mask = ~np.isin(col, categories)
            if np.any(unknown_mask):
                encoded_columns[unknown_mask, sum(len(c) for c in encoder.categories_[:i]):sum(len(c) for c in encoder.categories_[:i+1])] = 0
    
    non_encoded_columns = data[:, numerical_columns]
    
    if target_encoder is None:
        target_encoded_columns, target_encoded_column_names, target_encoders = target_encode(data, Y, columns_to_target_encode)
    else:
        target_encoded_columns = []
        target_encoded_column_names = target_encoder['column_names']
        target_encoders = target_encoder['encoders']
        for col in columns_to_target_encode:
            col_data = data[:, col]
            mean_encoded = target_encoders[col]
            encoded_col = np.vectorize(mean_encoded.get)(col_data)
            unknown_mask = ~np.isin(col_data, list(mean_encoded.keys()))
            encoded_col[unknown_mask] = 0
            target_encoded_columns.append(encoded_col)
        target_encoded_columns = np.column_stack(target_encoded_columns)
    
    encoded = np.hstack((encoded_columns, target_encoded_columns, non_encoded_columns))
    
    # Generate column names for the encoded columns
    encoded_column_names = []
    for i, categories in enumerate(encoder.categories_):
        col_name = column_names_for_one_hot_encode[i]
        encoded_column_names.extend([f"{col_name}_{category}" for category in categories])
    
    encoded_column_names.extend([name for sublist in target_encoded_column_names for name in sublist])
    encoded_column_names.extend(numerical_column_names)
    
    return encoded, encoded_column_names, encoder, {'column_names': target_encoded_column_names, 'encoders': target_encoders}

def generate_data(train, test):
    X_train = train[:, :-1]
    Y_train = train[:, -1]
    Y_train = (Y_train + 1) / 2
    Y_train = Y_train.astype(np.int64)

    # Replace zero values in the Birth Weight column with the mean of the remaining entries
    birth_weight_col = X_train[:, 23]
    non_zero_birth_weights = birth_weight_col[birth_weight_col != 0]
    mean_birth_weight = np.mean(non_zero_birth_weights)
    birth_weight_col[birth_weight_col == 0] = mean_birth_weight
    X_train[:, 23] = birth_weight_col
    
    X_train, encoded_column_names, encoder, target_encoder = encode(X_train, Y_train)
    X_train = X_train.astype(np.float64)
    
    birth_weight_col = test[:, 23]
    birth_weight_col[birth_weight_col == 0] = mean_birth_weight
    test[:, 23] = birth_weight_col
    unique, counts = np.unique(Y_train, return_counts=True)
    W = np.zeros((X_train.shape[1]+1, len(unique)), dtype=np.float64)

    X_test, _, _, _ = encode(test, Y_train, encoder, target_encoder)
    X_test = X_test.astype(np.float64)

    scaler = StandardScaler().fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)
    
    one = np.ones((X_train.shape[0], 1))
    X_train = np.hstack((one, X_train))
    
    one = np.ones((X_test.shape[0], 1))
    X_test = np.hstack((one, X_test))

    return X_train, Y_train, W, X_test, counts

X, Y, W, X_test, counts, encoded_column_names = generate_data(train, test)
print(encoded_column_names)

['Hospital Service Area_1.0', 'Hospital Service Area_2.0', 'Hospital Service Area_3.0', 'Hospital Service Area_4.0', 'Hospital Service Area_5.0', 'Hospital Service Area_6.0', 'Hospital Service Area_7.0', 'Hospital Service Area_8.0', 'Hospital County_1.0', 'Hospital County_2.0', 'Hospital County_3.0', 'Hospital County_4.0', 'Hospital County_5.0', 'Hospital County_6.0', 'Hospital County_7.0', 'Hospital County_8.0', 'Hospital County_9.0', 'Hospital County_10.0', 'Hospital County_11.0', 'Hospital County_12.0', 'Hospital County_13.0', 'Hospital County_14.0', 'Hospital County_15.0', 'Hospital County_17.0', 'Hospital County_18.0', 'Hospital County_19.0', 'Hospital County_20.0', 'Hospital County_21.0', 'Hospital County_22.0', 'Hospital County_23.0', 'Hospital County_24.0', 'Hospital County_25.0', 'Hospital County_26.0', 'Hospital County_27.0', 'Hospital County_28.0', 'Hospital County_29.0', 'Hospital County_30.0', 'Hospital County_31.0', 'Hospital County_32.0', 'Hospital County_33.0', 'Hospita

In [32]:
print(X.shape)
print(Y.shape)
print(X_test.shape)

(87595, 744)
(87595,)
(58397, 744)


In [33]:
def g(W,x,j):
    z = np.dot(x,W)
    sm= softmax(z)
    return sm[j]

def loss(X,Y,W,counts):
    n = X.shape[0]
    Z = X @ W
    softmax_probs = softmax(Z, axis=1)
    indices = (np.arange(n), Y)
    correct_class_probs = softmax_probs[indices]
    scaled_probs = np.log(correct_class_probs) / counts[Y]
    loss_value = -np.mean(scaled_probs) / 2
    return loss_value

def compute_gradient(X, Y, W, counts):
    n, m = X.shape
    k = W.shape[1]
    z = X @ W  
    softmax_probs = softmax(z, axis=1) 
    indices = (np.arange(n), Y)
    Y_one_hot = np.zeros((n, k))
    Y_one_hot[indices] = 1
    grad_W = X.T @ ((softmax_probs - Y_one_hot) / counts[Y][:, np.newaxis]) / (2 * n) 
    return grad_W

def compute_n(X, Y, W, gradient, n0, counts):
    nl = 0.0
    nh = n0
    prev_loss= loss(X, Y, W, counts)
    while prev_loss > loss(X, Y, W - nh*gradient, counts):
        nh *= 2
        prev_loss= loss(X, Y, W - nh*gradient, counts)
    if nh>n0:
        nl= nh/2
    else:
        while loss(X, Y, W, counts) < loss(X, Y, W - nh*gradient, counts):
            nh /= 2
        nh *= 2
    for _ in range(5):
        n1 = (2*nl + nh)/3
        n2 = (nl + 2*nh)/3
        if loss(X, Y, W - n1*gradient, counts) > loss(X, Y, W - n2*gradient, counts):
            nl = n1
        else:
            nh = n2
    return (nl+nh)/2

def gradient_descent(X, Y, W, counts, n0, epochs, batch_size):
    n = X.shape[0]
    for _ in range(epochs):
        batch_num = 0
        for start in range(0, n, batch_size):
            end = min(start + batch_size, n)
            X_batch = X[start:end]
            Y_batch = Y[start:end]
            batch_loss = loss(X_batch, Y_batch, W, counts)
            print(f"Epoch{_+1}, Batch{1+int(start/batch_size)}, Loss{batch_loss}")

            gradient = compute_gradient(X_batch, Y_batch, W, counts)
            learning_rate= compute_n(X_batch, Y_batch, W, gradient, n0, counts)
            W -= learning_rate * gradient
            batch_num += 1
    return W

print(X.shape, Y.shape, W.shape, counts)
W = gradient_descent(X,Y,W,counts,1e4,25,87595)

(87595, 744) (87595,) (744, 2) [48297 39298]
Epoch1, Batch1, Loss7.913090707916495e-06
Epoch2, Batch1, Loss6.978432936446689e-06
Epoch3, Batch1, Loss6.717537285255176e-06
Epoch4, Batch1, Loss6.591476566032787e-06
Epoch5, Batch1, Loss6.515803726617881e-06
Epoch6, Batch1, Loss6.46447778993629e-06
Epoch7, Batch1, Loss6.426862659891352e-06
Epoch8, Batch1, Loss6.397792231281019e-06
Epoch9, Batch1, Loss6.3744457868644464e-06
Epoch10, Batch1, Loss6.355148359190623e-06
Epoch11, Batch1, Loss6.338840303142261e-06
Epoch12, Batch1, Loss6.324816910420138e-06
Epoch13, Batch1, Loss6.312589963970589e-06
Epoch14, Batch1, Loss6.3018093818636115e-06
Epoch15, Batch1, Loss6.292216593515619e-06
Epoch16, Batch1, Loss6.283615633698692e-06
Epoch17, Batch1, Loss6.275854591136512e-06
Epoch18, Batch1, Loss6.268813342903741e-06
Epoch19, Batch1, Loss6.262395229610478e-06
Epoch20, Batch1, Loss6.256521270897664e-06
Epoch21, Batch1, Loss6.2511260590148205e-06
Epoch22, Batch1, Loss6.246154785439238e-06
Epoch23, Batch1,

In [34]:
Z = X_test @ W
output_model_pred = softmax(Z, axis=1)

In [36]:
print(output_model_pred.shape)
print(output_model_pred[:5])

(58397, 2)
[[0.28720821 0.71279179]
 [0.41624935 0.58375065]
 [0.43775763 0.56224237]
 [0.20031775 0.79968225]
 [0.47093007 0.52906993]]


In [38]:
# take the argmax of the output_model_pred to get the predicted class (converting 0 to -1 also)
predicted_class = np.argmax(output_model_pred, axis=1)
predicted_class[predicted_class == 0] = -1
print(predicted_class[:10])

[ 1  1  1  1  1 -1 -1 -1 -1 -1]


In [39]:
# write the predicted class to a txt file
np.savetxt('output.txt', predicted_class, fmt='%d')