### Importing and structuring the data

In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.metrics import confusion_matrix, accuracy_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.utils import to_categorical

# Set random seed for reproducibility
np.random.seed(0)

# Load the data from an Excel file
T1 = pd.read_excel("/Users/pegaheizad/Desktop/Ghosh_lab/data_files/sorted_cd73_data.xlsx")

# Define the columns for the different categories
# Metadata: Cols 1, 2, 5, 6, 7, 8, 9, 10, 25, 73, 138-144
# Geometric: Cols 3, 4, 11-24; Zernike Moments: Cols 28-57
# Intensity: Cols 58-72; Radial Distribution: Cols 75-85
# Texture: Cols 86-137
# Exclude Metadata, keep others
# T2 = pd.concat([T1.iloc[:, 2:4], T1.iloc[:, 10:24], T1.iloc[:, 25:72], T1.iloc[:, 73:137]], axis=1)
T2 = pd.concat([T1.iloc[:, 2:4], T1.iloc[:, 10:24], T1.iloc[:, 25:72], T1.iloc[:, 73:137]], axis=1)

# Convert the table to a NumPy array
T = T2.to_numpy()



### Setting the parameters for the ML

In [6]:
# Define the number of samples for each class
M = 3000
N = 3500

# Create label vector: 1 for Class 1, 0 for Class 2
t = np.concatenate([np.ones(M), np.zeros(N)])

# Explanatory variables: Zernike Moments (assumed to be columns 19-48)
X = T[:, 18:48]

# Ensure the dataset size matches the labels
assert X.shape[0] == len(t), "Mismatch between data and label lengths."

# Define the number of folds for cross-validation
kfold = 10
kf = KFold(n_splits=kfold, shuffle=True, random_state=0)

# Define the number of nodes in the hidden layer
nNodes = 40

# Initialize confusion matrix and lists for predictions
totcon = np.zeros((2, 2))  
totpred = np.array([])     
totytest = np.array([])    
models = []
acc = []


### Training the ML

In [7]:
# Train-test loop
for train_index, test_index in kf.split(X):
    # Split the data into training and test sets
    xtrain, xtest = X[train_index], X[test_index]
    ytrain, ytest = t[train_index], t[test_index]

    # Convert labels to categorical (for binary classification)
    ytrain_cat = to_categorical(ytrain, 2)
    ytest_cat = to_categorical(ytest, 2)

    # Build the neural network model
    model = Sequential()
    model.add(Dense(nNodes, input_shape=(X.shape[1],), activation='relu'))
    model.add(Dense(2, activation='softmax'))  # 2 output classes

    # Compile the model
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

    # Train the model (90% train, 10% validation, no test ratio)
    model.fit(xtrain, ytrain_cat, epochs=50, batch_size=32, validation_split=0.1, verbose=0)

    # Store the trained model
    models.append(model)

    # Make predictions on the test set
    pred = model.predict(xtest)
    pred_labels = np.argmax(pred, axis=1)

    # Calculate confusion matrix
    con = confusion_matrix(ytest, pred_labels)
    totcon += con

    # Calculate accuracy for this fold
    acc.append(accuracy_score(ytest, pred_labels) * 100)

    # Store predictions and true labels for confusion matrix plotting
    totpred = np.concatenate([totpred, pred_labels])
    totytest = np.concatenate([totytest, ytest])

# Print the average accuracy over all folds
print(f"Average accuracy over {kfold} folds: {np.mean(acc)}%")

# Optional: save the models and accuracy data
# np.save("models.npy", models)
# np.savetxt("accuracy.csv", acc, delimiter=",")

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 600us/step


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 621us/step


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 625us/step


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 663us/step


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 609us/step


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 598us/step


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 609us/step


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 603us/step


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 588us/step


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 605us/step
Average accuracy over 10 folds: 81.78461538461538%
