In [1]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split as tts
from sklearn.metrics import accuracy_score, recall_score, precision_score
from sklearn.utils import shuffle

In [2]:
def remove_correlated_features(X, corr_threshold=0.9):
    """Drop, in place, columns that are strongly correlated with an earlier column.

    A column ``j`` is removed when at least one column ``i`` positioned before it
    has ``X.corr()[i, j] >= corr_threshold``. The comparison is on the signed
    coefficient (no absolute value is taken), so strongly *negative*
    correlations are not treated as redundant.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature frame. It is modified in place.
    corr_threshold : float, default 0.9
        Minimum correlation coefficient at which the later column is dropped.

    Returns
    -------
    pandas.Index
        Labels of the columns that were removed from ``X``.
    """
    corr = X.corr()
    # Keep only the strict upper triangle so each pair (i, j) with i < j is
    # considered once and a column is never compared with itself.
    exceeds = np.triu(corr.to_numpy() >= corr_threshold, k=1)
    drop_mask = exceeds.any(axis=0)
    # Index the labels of the correlation matrix itself so the boolean mask
    # always has a matching length.
    columns_dropped = corr.columns[drop_mask]
    X.drop(columns_dropped, axis=1, inplace=True)
    return columns_dropped

In [3]:
def remove_less_significant_features(X, Y, sl=0.05):
    """Backward elimination: repeatedly drop the column with the largest OLS p-value.

    On each pass an ordinary-least-squares model ``sm.OLS(Y, X)`` is fitted.
    If the largest p-value exceeds ``sl`` the corresponding column is removed
    from ``X`` (in place) and the model is refitted; otherwise the loop stops.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature frame. It is modified in place.
    Y : array-like
        Target values passed to ``sm.OLS`` as the endogenous variable.
    sl : float, default 0.05
        Significance level; columns whose p-value is above it are dropped.

    Returns
    -------
    numpy.ndarray
        Labels of the removed columns, in removal order (empty if none).
    """
    columns_dropped = np.array([])
    # At most one column is removed per pass, so len(X.columns) passes suffice.
    # When X has no columns the loop body never runs and nothing is fitted.
    for _ in range(len(X.columns)):
        pvalues = sm.OLS(Y, X).fit().pvalues
        max_col = pvalues.idxmax()
        max_val = pvalues.max()
        if not (max_val > sl):
            break
        X.drop(max_col, axis='columns', inplace=True)
        columns_dropped = np.append(columns_dropped, [max_col])
    return columns_dropped

In [4]:
def compute_cost(W, X, Y, reg_strength=None):
    """Return the soft-margin SVM cost ``0.5 * W.W + C * mean(hinge)``.

    Parameters
    ----------
    W : numpy.ndarray
        Weight vector.
    X : numpy.ndarray
        Sample matrix; ``X.shape[0]`` is used as the number of samples.
    Y : numpy.ndarray
        Labels, multiplied element-wise with ``X . W``.
    reg_strength : float, optional
        The factor ``C`` applied to the mean hinge loss. When omitted, the
        notebook-level variable ``regularization_strength`` is used.

    Returns
    -------
    float
        The cost value.
    """
    if reg_strength is None:
        # Backward-compatible default: read the notebook-level hyper-parameter.
        reg_strength = regularization_strength
    n_samples = X.shape[0]
    margins = 1 - Y * (np.dot(X, W))
    # hinge loss: max(0, 1 - y * (x . w)), averaged over the samples
    hinge_loss = reg_strength * (np.sum(np.maximum(0, margins)) / n_samples)
    return 1 / 2 * np.dot(W, W) + hinge_loss


In [5]:
def calculate_cost_gradient(W, X_batch, Y_batch, reg_strength=None):
    """Return the average (sub)gradient of the SVM cost over a batch.

    For every sample the contribution is ``W`` when ``1 - y * (x . W) <= 0``
    and ``W - C * y * x`` otherwise; the contributions are averaged.

    Parameters
    ----------
    W : numpy.ndarray
        Current weight vector.
    X_batch : array-like
        Either a single feature vector or a 2-D array with one sample per row.
    Y_batch : scalar or array-like
        The label(s) matching ``X_batch``. Any scalar is accepted, not only
        ``numpy.float64``.
    reg_strength : float, optional
        The factor ``C``. When omitted, the notebook-level variable
        ``regularization_strength`` is used.

    Returns
    -------
    numpy.ndarray
        Gradient vector with the same length as ``W``.
    """
    if reg_strength is None:
        # Backward-compatible default: read the notebook-level hyper-parameter.
        reg_strength = regularization_strength

    # Promote a single (x, y) pair to a batch of one so the code below is uniform.
    Y_batch = np.atleast_1d(Y_batch)
    X_batch = np.atleast_2d(X_batch)

    distance = 1 - (Y_batch * np.dot(X_batch, W))
    # Only samples with a positive hinge term contribute the data-dependent part.
    violating = (distance > 0)[:, np.newaxis]
    hinge_part = (reg_strength * Y_batch)[:, np.newaxis] * X_batch
    per_sample = np.where(violating, W - hinge_part, W)

    return per_sample.sum(axis=0) / len(Y_batch)  # average
 

In [6]:
def sgd(features, outputs, learning_rate, regularization_strength,
        max_epochs=5000, cost_threshold=0.01, random_state=None):
    """Train a weight vector with per-sample stochastic gradient descent.

    Parameters
    ----------
    features : numpy.ndarray
        2-D sample matrix; ``features.shape[1]`` sets the number of weights.
    outputs : numpy.ndarray
        Labels aligned with the rows of ``features``.
    learning_rate : float
        Step size applied to every per-sample gradient.
    regularization_strength : float
        Accepted for interface compatibility. It is not forwarded by this
        function: the cost and gradient helpers are called with weights and
        data only and read the notebook-level ``regularization_strength``.
    max_epochs : int, default 5000
        Upper bound for the epoch counter; epochs ``1 .. max_epochs - 1`` run.
    cost_threshold : float, default 0.01
        Relative cost change below which training stops early.
    random_state : int, numpy.random.RandomState or None, default None
        Seed or generator for the per-epoch shuffle. ``None`` leaves the
        shuffle unseeded.

    Returns
    -------
    numpy.ndarray
        The learned weight vector.
    """
    # Turn an integer seed into one generator that is reused across epochs;
    # passing the bare integer to shuffle() would repeat the same permutation.
    if isinstance(random_state, (int, np.integer)):
        rng = np.random.RandomState(random_state)
    else:
        rng = random_state

    weights = np.zeros(features.shape[1])
    nth = 0
    prev_cost = float("inf")
    # stochastic gradient descent
    for epoch in range(1, max_epochs):
        # shuffle to prevent repeating update cycles
        X, Y = shuffle(features, outputs, random_state=rng)
        for ind, x in enumerate(X):
            gradient = calculate_cost_gradient(weights, x, Y[ind])
            weights = weights - (learning_rate * gradient)
        # convergence is only checked on epochs 1, 2, 4, 8, ... and on the last one
        if epoch == 2 ** nth or epoch == max_epochs - 1:
            cost = compute_cost(weights, features, outputs)
            print("Epoch is: {} and Cost is: {}".format(epoch, cost))
            # stoppage criterion: relative change since the previous check
            if abs(prev_cost - cost) < cost_threshold * prev_cost:
                return weights
            prev_cost = cost
            nth += 1
    return weights


In [7]:
# hyper-parameters for SVM training (read by the cells below)
learning_rate = 1e-6
regularization_strength = 10000


In [8]:
class init:
    """Run the end-to-end SVM experiment.

    Every statement in this class body executes once, when the class is
    defined. The intermediate results (``data``, ``X``, ``Y``, the train/test
    splits, the weights ``W`` and the predictions) remain available as class
    attributes, e.g. ``init().X``.
    """
    print("reading dataset...")
    # read data in pandas (pd) data frame
    data = pd.read_csv(r'C:\Users\Nandhini R\OneDrive\Desktop\Mini Project\Breast Cancer\data.csv')
    # drop last column (extra column added by pd)
    # and unnecessary first column (id)
    data.drop(data.columns[[-1, 0]], axis=1, inplace=True)
    print("applying feature engineering...")
    # convert categorical labels to numbers
    diag_map = {'M': 1.0, 'B': -1.0}
    data['diagnosis'] = data['diagnosis'].map(diag_map)
    # put features & outputs in different data frames
    Y = data.loc[:, 'diagnosis']
    # copy so the in-place column drops below act on an independent frame
    # instead of a slice of `data`
    X = data.iloc[:, 1:].copy()
    # filter features (both helpers modify X in place)
    remove_correlated_features(X)
    remove_less_significant_features(X, Y)
    # normalize data for better convergence and to prevent overflow
    # NOTE(review): the scaler is fit on all rows before the train/test split
    # below, so test rows influence the scaling.
    X_normalized = MinMaxScaler().fit_transform(X.values)
    X = pd.DataFrame(X_normalized)
    # insert 1 in every row for intercept b
    X.insert(loc=len(X.columns), column='intercept', value=1)
    # split data into train and test set
    print("splitting dataset into train and test sets...")
    X_train, X_test, y_train, y_test = tts(X, Y, test_size=0.2, random_state=42)
    # train the model
    print("training started...")
    W = sgd(X_train.to_numpy(), y_train.to_numpy(), learning_rate, regularization_strength)
    print("training finished.")
    print("weights are: {}".format(W))
    # testing the model
    print("testing the model...")
    # predicted label is the sign of the linear score x . W
    y_train_predicted = np.sign(np.dot(X_train.to_numpy(), W))
    y_test_predicted = np.sign(np.dot(X_test.to_numpy(), W))
    print("accuracy on test dataset: {}".format(accuracy_score(y_test, y_test_predicted)))
    print("recall on test dataset: {}".format(recall_score(y_test, y_test_predicted)))
    print("precision on test dataset: {}".format(precision_score(y_test, y_test_predicted)))


reading dataset...
applying feature engineering...
splitting dataset into train and test sets...
training started...
Epoch is: 1 and Cost is: 7079.408849887995
Epoch is: 2 and Cost is: 6276.279799249498
Epoch is: 4 and Cost is: 4917.125766557726
Epoch is: 8 and Cost is: 3449.819625386366
Epoch is: 16 and Cost is: 2526.0616379400058
Epoch is: 32 and Cost is: 1979.0151521368525
Epoch is: 64 and Cost is: 1558.15411955798
Epoch is: 128 and Cost is: 1301.911681783263
Epoch is: 256 and Cost is: 1151.4668099396583
Epoch is: 512 and Cost is: 1073.8824468974717
Epoch is: 1024 and Cost is: 1042.5508489749907
Epoch is: 2048 and Cost is: 1028.3196348984927
Epoch is: 4096 and Cost is: 1024.320693798405
training finished.
weights are: [ 3.86714928  9.11560226 -2.32093915 -6.09472852 11.52299979 -1.64937459
 -9.67340106  1.84903449 -1.55811656  3.09368269  4.82044275  5.2844758
 -5.06446508]
testing the model...
accuracy on test dataset: 0.9649122807017544
recall on test dataset: 0.9302325581395349
p

In [9]:
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, recall_score, precision_score
def k_fold_cross_validation(X, y, k, learning_rate, regularization_strength):
    """Train and evaluate the SGD model on ``k`` consecutive folds.

    For each split produced by ``KFold(n_splits=k)`` a model is trained with
    ``sgd`` on the training rows and scored on the held-out rows. The mean
    accuracy, recall and precision over all folds are printed.

    Parameters
    ----------
    X : numpy.ndarray
        Sample matrix, indexed with integer index arrays.
    y : numpy.ndarray
        Labels aligned with the rows of ``X``.
    k : int
        Number of folds.
    learning_rate : float
        Forwarded to ``sgd``.
    regularization_strength : float
        Forwarded to ``sgd``.

    Returns
    -------
    None
    """
    kf = KFold(n_splits=k)
    accuracy_scores = []
    recall_scores = []
    precision_scores = []

    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        # train the model on training set
        W = sgd(X_train, y_train, learning_rate, regularization_strength)

        # predict on test set: label is the sign of the linear score x . W
        y_test_predicted = np.sign(np.dot(X_test, W))

        # evaluate model on accuracy, recall and precision
        accuracy_scores.append(accuracy_score(y_test, y_test_predicted))
        recall_scores.append(recall_score(y_test, y_test_predicted))
        precision_scores.append(precision_score(y_test, y_test_predicted))

    print("Average accuracy score: {}".format(np.mean(accuracy_scores)))
    print("Average recall score: {}".format(np.mean(recall_scores)))
    print("Average precision score: {}".format(np.mean(precision_scores)))


In [10]:
# hyper-parameters for the cross-validated run
k = 10
learning_rate = 1e-6
regularization_strength = 10000

# instantiating `init` gives access to the X / Y frames built in its class body
model = init()
k_fold_cross_validation(
    X=model.X.to_numpy(),
    y=model.Y.to_numpy(),
    k=k,
    learning_rate=learning_rate,
    regularization_strength=regularization_strength,
)

Epoch is: 1 and Cost is: 6153.45921729244
Epoch is: 2 and Cost is: 5516.849346967191
Epoch is: 4 and Cost is: 4437.26060611007
Epoch is: 8 and Cost is: 3164.0102986609277
Epoch is: 16 and Cost is: 2292.341289120065
Epoch is: 32 and Cost is: 1754.1770322940934
Epoch is: 64 and Cost is: 1386.283121185798
Epoch is: 128 and Cost is: 1153.8501542698884
Epoch is: 256 and Cost is: 1052.2882045498736
Epoch is: 512 and Cost is: 989.5516328096044
Epoch is: 1024 and Cost is: 959.3819197657274
Epoch is: 2048 and Cost is: 952.2964765984168
Epoch is: 1 and Cost is: 7054.30360639495
Epoch is: 2 and Cost is: 6007.466162293285
Epoch is: 4 and Cost is: 4436.516232420734
Epoch is: 8 and Cost is: 3070.774129400257
Epoch is: 16 and Cost is: 2257.7865427411693
Epoch is: 32 and Cost is: 1801.103002701914
Epoch is: 64 and Cost is: 1424.311861779004
Epoch is: 128 and Cost is: 1204.9672005221396
Epoch is: 256 and Cost is: 1057.5641041750814
Epoch is: 512 and Cost is: 971.9616583699693
Epoch is: 1024 and Cost is