In [1]:
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
from sklearn.datasets import load_iris
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OrdinalEncoder
from sklvq import GMLVQ
from timeit import timeit
from sklearn.model_selection import KFold

from sklearn.metrics import confusion_matrix
from sklearn.model_selection import (
    cross_val_score,
    RepeatedKFold,
)


# Use small tick labels on both axes for every figure in this notebook.
matplotlib.rcParams.update(
    {
        "xtick.labelsize": "small",
        "ytick.labelsize": "small",
    }
)

In [2]:
def getdata():
    """Load features and labels from CSV and build the center subset.

    Reads ``feature_vectors.csv``, ``diagnosis_label.csv`` and
    ``center_label.csv`` from the working directory. The diagnosis and center
    labels are joined element-wise (``labels + labelscenter``) and the rows
    whose joined label equals ``HCUMCG``, ``HCUGOSM`` or ``HCCUN`` are
    collected, in that order.

    Returns
    -------
    tuple
        ``(data, labels, centerdata, centerlabels)`` where ``data`` and
        ``labels`` are the full arrays and ``centerdata`` / ``centerlabels``
        are the selected rows and their joined labels.
    """
    data = pd.read_csv("feature_vectors.csv").to_numpy()
    labels = pd.read_csv("diagnosis_label.csv").to_numpy().squeeze()
    labelscenter = pd.read_csv("center_label.csv").to_numpy().squeeze()

    # Joined "<diagnosis><center>" label per row.
    combined = labels + labelscenter

    # Collect the matching rows group by group so the output keeps the
    # HCUMCG -> HCUGOSM -> HCCUN ordering.
    picked_labels = []
    picked_rows = []
    for tag in ("HCUMCG", "HCUGOSM", "HCCUN"):
        hits = np.where(combined == tag)
        picked_labels.append(combined[hits])
        picked_rows.append(data[hits])

    centerlabels = np.concatenate(picked_labels)
    centerdata = np.concatenate(picked_rows)

    return data, labels, centerdata, centerlabels

In [3]:
def ztransform(data_ztransform):
    """Return a z-transformed copy of ``data_ztransform``.

    A fresh ``StandardScaler`` is fitted on the input and applied to it in one
    step. The scaler itself is discarded, so the transform cannot be inverted
    by the caller afterwards.
    """
    return StandardScaler().fit_transform(data_ztransform)

In [4]:
def model_definition():
    """Build an unfitted GMLVQ model with this notebook's fixed settings.

    Returns
    -------
    GMLVQ
        A new estimator using the adaptive squared Euclidean distance, a
        sigmoid activation (beta = 2) and the SGD solver, seeded with 1428.
    """
    sgd_settings = {
        "max_runs": 20,
        "batch_size": 1,
        "step_size": np.array([0.1, 0.05]),
    }
    return GMLVQ(
        distance_type="adaptive-squared-euclidean",
        activation_type="sigmoid",
        activation_params={"beta": 2},
        solver_type="sgd",
        solver_params=sgd_settings,
        random_state=1428,
    )

In [5]:
def train_modelkfold(data, label, folds=10, repeats=2, random_state=None):
    """Train one GMLVQ model per fold of a repeated, shuffled k-fold split.

    Parameters
    ----------
    data : array-like
        Feature matrix, indexed by row.
    label : array-like
        Class label for each row of ``data``.
    folds : int, default=10
        Number of folds per repetition (passed to ``KFold``).
    repeats : int, default=2
        Number of times the shuffled k-fold split is repeated.
    random_state : int, numpy.random.RandomState or None, default=None
        Seed (or generator) for the fold shuffling. ``None`` keeps the
        shuffling unseeded; an int makes the splits reproducible while still
        giving every repetition a different shuffle.

    Returns
    -------
    numpy.ndarray
        Object array of shape ``(repeats, folds)``; entry ``[r, k]`` is the
        model fitted on the training part of fold ``k`` in repetition ``r``.
        Every entry is a fitted model (no placeholder rows).

    Raises
    ------
    ValueError
        If ``repeats`` is smaller than 1.
    """
    if repeats < 1:
        raise ValueError("repeats must be at least 1")

    data = np.asarray(data)
    label = np.asarray(label)

    # One generator shared by all repetitions: a seed gives reproducible runs,
    # yet each repetition still draws a different shuffle.
    if isinstance(random_state, np.random.RandomState):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    modelmatrix = np.empty((repeats, folds), dtype=object)

    for repeated in range(repeats):
        print("========Repeated fold number", str(repeated), "========")
        kfold = KFold(folds, shuffle=True, random_state=rng)
        for k, (training_indices, testing_indices) in enumerate(kfold.split(data)):
            trainX, trainY = data[training_indices], label[training_indices]
            testX, testY = data[testing_indices], label[testing_indices]
            print(np.shape(trainX))

            model = model_definition()
            model.fit(trainX, trainY)

            # Validation accuracy: fraction of held-out labels predicted correctly.
            accuracy = np.mean(model.predict(testX) == testY)
            print('%.3f accuracy' % accuracy)

            modelmatrix[repeated, k] = model

    return modelmatrix

In [6]:
def calculate_lambda(models=None):
    """Average the relevance matrices (``lambda_``) of a set of fitted models.

    Parameters
    ----------
    models : iterable or numpy.ndarray, optional
        Fitted models, either as a flat iterable or as an object array of any
        shape (for example the matrix returned by ``train_modelkfold``).
        Entries without a ``lambda_`` attribute, such as unfilled placeholder
        slots, are ignored. When omitted, the notebook-level ``modellist`` is
        used if it exists, otherwise ``modelmatrix``.

    Returns
    -------
    numpy.ndarray
        Element-wise mean of the ``lambda_`` matrices.

    Raises
    ------
    NameError
        If ``models`` is omitted and neither global exists.
    ValueError
        If no entry carries a ``lambda_`` attribute.
    """
    if models is None:
        # Backward-compatible zero-argument call: look the models up in the
        # notebook namespace instead of relying on an undefined global.
        scope = globals()
        models = scope.get("modellist", scope.get("modelmatrix"))
        if models is None:
            raise NameError(
                "calculate_lambda() needs `models`, or a global "
                "`modellist` / `modelmatrix`"
            )

    candidates = models.ravel() if isinstance(models, np.ndarray) else list(models)
    fitted = [model for model in candidates if hasattr(model, "lambda_")]
    if not fitted:
        raise ValueError("no fitted models with a `lambda_` attribute were given")

    return sum(model.lambda_ for model in fitted) / len(fitted)

In [7]:
# Load the full feature matrix/labels plus the center subset selected by getdata().
data,labels,centerdata,centerlabels = getdata()
# Train the k-fold models on the center subset; the result is an object matrix of models.
modelmatrix = train_modelkfold(centerdata,centerlabels)

(73, 36)
1.000 accuracy
(73, 36)
0.889 accuracy
(74, 36)
0.875 accuracy
(74, 36)
1.000 accuracy
(74, 36)
0.750 accuracy
(74, 36)
0.875 accuracy
(74, 36)
1.000 accuracy
(74, 36)
1.000 accuracy
(74, 36)
1.000 accuracy
(74, 36)
0.875 accuracy
(73, 36)
0.889 accuracy
(73, 36)
1.000 accuracy
(74, 36)
0.750 accuracy
(74, 36)
0.875 accuracy
(74, 36)
0.875 accuracy
(74, 36)
0.750 accuracy
(74, 36)
0.875 accuracy
(74, 36)
1.000 accuracy
(74, 36)
0.875 accuracy
(74, 36)
0.875 accuracy


In [None]:
# Average the relevance matrices (lambda_) of the trained models.
average_lambda = calculate_lambda()

# Quick check of the returned object's type (debugging aid).
type(average_lambda)

# Eigenvalue decomposition of the average lambda to derive its eigenvalues and eigenvectors

In [None]:
def eigendecomposition(average_lambda):
    """Eigendecompose a symmetric matrix and sort the result in descending order.

    Parameters
    ----------
    average_lambda : numpy.ndarray
        Square symmetric matrix handed to ``np.linalg.eigh``.

    Returns
    -------
    tuple
        ``(eigenvalues, eigenvectors)``: eigenvalues from largest to smallest,
        and a matrix whose row ``i`` is the eigenvector belonging to
        ``eigenvalues[i]``.
    """
    ascending_values, column_vectors = np.linalg.eigh(average_lambda)

    # eigh reports eigenvalues in ascending order: reverse them.
    eigenvalues = ascending_values[::-1]

    # eigh stores eigenvectors as columns: reverse the column order to match
    # the eigenvalues, then transpose so that each eigenvector becomes a row.
    eigenvectors = column_vectors[:, ::-1].T

    return eigenvalues, eigenvectors

In [None]:
# Descending eigenvalues and row-wise eigenvectors of the averaged relevance matrix.
eigenvalues,eigenvectors = eigendecomposition(average_lambda)

In [None]:
# Number of eigenvalues returned by the decomposition.
len(eigenvalues)

In [None]:
# Placeholder x-axis labels for the 36 features: 'a'..'z' followed by 'ab'..'ak'.
feature_names = list("abcdefghijklmnopqrstuvwxyz") + [
    "ab", "ac", "ad", "ae", "af",
    "ag", "ah", "ai", "aj", "ak",
]


In [None]:
def ploteigenvalues(eigenvalues, eigenvectors, feature_labels=None, relevance_matrix=None):
    """Plot and save the eigen-spectrum of the center relevance matrix.

    Four bar charts are created and written to the working directory:
    ``centereigenvalues.png`` (all eigenvalues), ``Firstvectorcenter.png`` and
    ``Secondvectorcenter.png`` (the first two rows of ``eigenvectors``) and
    ``centerrelevance.png`` (the diagonal of the relevance matrix).

    Parameters
    ----------
    eigenvalues : array-like
        Eigenvalues to plot, one bar each.
    eigenvectors : numpy.ndarray
        Matrix with one eigenvector per row; rows 0 and 1 are plotted.
    feature_labels : sequence of str, optional
        Bar labels for the per-feature plots. Defaults to the notebook-level
        ``feature_names``.
    relevance_matrix : numpy.ndarray, optional
        Matrix whose diagonal is plotted. Defaults to the notebook-level
        ``average_lambda``.

    Returns
    -------
    None
        The figures are left open so the notebook can render them.
    """
    # No `%matplotlib inline` here: a line magic is not valid Python inside a
    # function body outside IPython, and it re-configured the backend on every
    # call. The inline backend is the default in Jupyter kernels.
    if feature_labels is None:
        feature_labels = feature_names
    if relevance_matrix is None:
        relevance_matrix = average_lambda

    def _bar_figure(title, positions, heights, filename):
        # Draw one titled bar chart and save it under `filename`.
        fig, ax = plt.subplots()
        fig.suptitle(title)
        ax.bar(positions, heights)
        ax.set_ylabel("Weight")
        ax.grid(False)
        fig.savefig(filename)
        return fig

    _bar_figure(
        "Eigenvalues",
        range(len(eigenvalues)),
        eigenvalues,
        'centereigenvalues.png',
    )
    _bar_figure(
        "First Eigenvector",
        feature_labels,
        eigenvectors[0, :],
        'Firstvectorcenter.png',
    )
    _bar_figure(
        "Second Eigenvector",
        feature_labels,
        eigenvectors[1, :],
        'Secondvectorcenter.png',
    )
    _bar_figure(
        "Relevance Matrix Diagonal Center data",
        feature_labels,
        np.diagonal(relevance_matrix),
        'centerrelevance.png',
    )


In [None]:
# Render and save the eigenvalue, eigenvector and relevance-diagonal bar charts.
ploteigenvalues(eigenvalues,eigenvectors)

In [None]:
# Build the correction matrix I - sum_{i<K} v_i v_i^T from the K leading
# eigenvectors (rows of `eigenvectors`). Because np.linalg.eigh returns
# orthonormal eigenvectors, this is the projector that removes the subspace
# they span.
N = eigenvectors.shape[1]  # feature dimension (36 for this dataset)
K = 3                      # number of leading eigenvectors to project out
I = np.identity(N)
outerproduct = np.zeros((N, N))
for i in range(K):
    # Accumulate the outer products; plain assignment here would keep only
    # the last eigenvector and silently drop the others.
    outerproduct += np.outer(eigenvectors[i, :], eigenvectors[i, :])

correctionmatrix = I - outerproduct

In [None]:
# Number of rows of the correction matrix.
len(correctionmatrix)

In [None]:
# Sklearn's StandardScaler performs the z-transform.
scaler = StandardScaler()

# Fit the scaler on the full feature matrix, because `data` is what model1 is
# trained on and what the later `scaler.inverse_transform(data)` cell undoes.
# Fitting it on `centerdata` left `data` unscaled and made that inverse
# transform meaningless. `centerdata` stays unscaled, matching how the k-fold
# center models were trained.
# NOTE: this overwrites `data`; re-run the data-loading cell before re-running
# this one.
data = scaler.fit_transform(data)

# Same configuration as model_definition(), plus the relevance correction
# derived from the center models.
model1 = GMLVQ(
    distance_type="adaptive-squared-euclidean",
    activation_type="sigmoid",
    activation_params={"beta": 2},
    solver_type="sgd",
    solver_params={"max_runs": 20,"batch_size":1,"step_size": np.array([0.1, 0.05])},
    random_state=1428,
    relevance_correction=correctionmatrix
)

In [None]:

# Fit the relevance-corrected model on the full feature matrix and diagnosis labels.
# NOTE(review): `data` is only z-transformed at this point if an earlier cell scaled
# it with `scaler` - confirm before relying on the "scaled data" assumption.
model1.fit(data, labels)

# Predict on the same rows the model was fitted on (training-set predictions).
predicted_labels = model1.predict(data)
# Optional training report, left disabled. NOTE(review): `labelssubclass` is not
# defined in the visible notebook; `labels` is the array used for fitting.
#print(classification_report(labelssubclass, predicted_labels))

In [None]:
# Keep a reference to the omega matrix learned by model1.
# NOTE(review): `omegad` is not read by any later visible cell.
omegad = model1.omega_

In [None]:
# Bar chart of the eigenvalues exposed by the fitted model.
fig, ax = plt.subplots()
fig.suptitle("Eigenvalues")
ax.bar(range(len(model1.eigenvalues_)), model1.eigenvalues_)
ax.set(ylabel="Weight")
ax.grid(False)
fig.savefig('diseaseeigenvalues.png')


# First two rows of `omega_hat_`, plotted per feature.
fig, ax = plt.subplots()
fig.suptitle("First Eigenvector")
ax.bar(feature_names, model1.omega_hat_[0, :])
ax.set(ylabel="Weight")
ax.grid(False)
fig.savefig('Firstvectordisease.png')


fig, ax = plt.subplots()
fig.suptitle("Second Eigenvector")
ax.bar(feature_names, model1.omega_hat_[1, :])
ax.set(ylabel="Weight")
ax.grid(False)
fig.savefig('Secondvectordisease.png')


# Diagonal of the fitted model's relevance matrix.
# NOTE(review): the title below says "Center data" although model1 is fitted on
# the diagnosis labels - confirm whether the title is a copy-paste leftover.
relevance_matrix = model1.lambda_
fig, ax = plt.subplots()
fig.suptitle("Relevance Matrix Diagonal Center data")
ax.bar(feature_names, np.diagonal(relevance_matrix))
ax.set(ylabel="Weight")
ax.grid(False)
fig.savefig('disease123.png')

In [None]:
# Undo the z-transform so data and prototypes are projected in original units.
# NOTE(review): this overwrites `data`; re-running only this cell applies the
# inverse transform a second time.
data = scaler.inverse_transform(data)
transformed_data = model1.transform(data, scale=True)

x_d = transformed_data[:, 0]
y_d = transformed_data[:, 1]

# Project the prototypes the same way (scale=True scales by the square root of
# the eigenvalues), after inverse-transforming them like the data.
prototypes = scaler.inverse_transform(model1.prototypes_)

transformed_model1 = model1.transform(prototypes, scale=True)

print(len(model1.prototypes_))
x_m = transformed_model1[:, 0]
y_m = transformed_model1[:, 1]

fig, ax = plt.subplots()
fig.suptitle("Maindataset disease data with corresponding prototypes")
colors = ['yellow','Magenta','brown']#,'pink','lightgreen']
for i, cls in enumerate(model1.classes_):
    ii = cls == labels
    ax.scatter(
        x_d[ii],
        y_d[ii],
        # Cycle through the palette so extra classes do not raise IndexError.
        c=colors[i % len(colors)],
        s=100,
        alpha=0.7,
        edgecolors="white",
        # Label each point cloud with its own class. Indexing
        # `prototypes_labels_` by class position is only right when there is
        # exactly one prototype per class, stored in class order.
        label=cls,
    )
# `prototypes_labels_` holds indices into `classes_`, so colour every prototype
# like the class it represents, however many prototypes there are.
prototype_colors = [colors[j % len(colors)] for j in model1.prototypes_labels_]
ax.scatter(x_m, y_m, c=prototype_colors, s=180, alpha=0.8, edgecolors="black", linewidth=2.0)
ax.set_xlabel("First eigenvector")
ax.set_ylabel("Second eigenvector")
ax.legend()
ax.grid(True)
print(model1.classes_)
plt.savefig('disease.png')
#plt.savefig('destination_path23.eps', format='eps')
print('A1: Black, A2: Red, B1: Green, B2: Blue, Centers A and B, and diseases 1 and 2.) ')

In [None]:
# `modellist` only ever existed as a local inside train_modelkfold, so the
# original line raised NameError on a fresh kernel. Show the second trained
# model (repetition 0, fold 1) from the matrix that the training cell returned.
print(modelmatrix[0, 1])